fix: implement TD-07, bound the chat await path with a deadline

2026-06-12 12:08:27 +02:00
parent 2e633922f9
commit 66938c23f2
6 changed files with 124 additions and 6 deletions
--- a/TECHDEBTS.md
+++ b/TECHDEBTS.md
@@ -312,7 +312,20 @@ airplane mode / local model / toast.
 events arrive before the final `{:ok, reply}`; tool-call rounds still work;
 cancellation mid-stream (TD-07) aborts the HTTP request.

-### TD-07: Bound the chat await chain; end-to-end timeout & cancellation
+### TD-07: Bound the chat await chain; end-to-end timeout & cancellation ✅ DONE (2026-06-12)
+
+**Status: implemented.** `BDS.AI.Chat.send_chat_message/3` no longer waits
+unboundedly on the supervised chat task: `await_chat_task/2` now applies a
+global deadline derived from the configured per-request HTTP budget and the
+bounded tool loop (`BDS.AI.HttpClient.request_timeout_ms() *
+(chat_max_tool_rounds + 1)` plus the `:await_timeout_margin_ms` chat setting). On
+deadline expiry it returns `{:error, :chat_timeout}` and shuts the task down via
+`Task.shutdown/2`, so the caller is released even if the runtime wedges.
+`config/config.exs` now exposes `:await_timeout_margin_ms` under `:chat`. The
+acceptance proof is a shutdown-aware blocking runtime test that asserts the
+timeout result, verifies the task receives shutdown, and confirms the
+conversation persists only the user message on timeout; existing cancellation
+and streaming tests remain green.

 **Context.** `BDS.AI.Chat.send_chat_message/3` blocks the caller (a LiveView
 process) on a hand-rolled `await_chat_task/1` — a raw `receive` with **no
--- a/config/config.exs
+++ b/config/config.exs
@@ -70,10 +70,13 @@ config :bds, :scripting,
 # streaming: chat completions use SSE when the provider supports it (set to
 # false for OpenAI-compatible servers that reject the "stream" flag).
 # stream_emit_interval_ms throttles how often streamed content reaches the UI.
+# await_timeout_margin_ms is extra slack added to the overall chat deadline
+# (per-request HTTP timeout * (max_tool_rounds + 1)), so callers never hang.
 config :bds, :chat,
  max_tool_rounds: 10,
  streaming: true,
-  stream_emit_interval_ms: 100
+  stream_emit_interval_ms: 100,
+  await_timeout_margin_ms: 5_000

 config :bds, :embeddings,
  backend: BDS.Embeddings.Backends.Neural,
--- a/lib/bds/ai/chat.ex
+++ b/lib/bds/ai/chat.ex
@@ -27,6 +27,7 @@ defmodule BDS.AI.Chat do
  @title_max_output_tokens 256
  @chat_title_max_length 30
  @chat_max_tool_rounds 10
+  @chat_await_timeout_margin_ms 5_000
  @default_context_window 128_000

  @spec start_chat(map()) :: {:ok, map()} | {:error, Ecto.Changeset.t()}
@@ -208,7 +209,7 @@ defmodule BDS.AI.Chat do
      send(task.pid, :sandbox_ready)

      try do
-        await_chat_task(task)
+        await_chat_task(task, chat_await_timeout_ms())
      after
        InFlight.unregister(conversation.id)
      end
@@ -756,9 +757,20 @@ defmodule BDS.AI.Chat do
  # BoundedToolLoop: the tool-calling round count is capped by
  # config.chat_max_tool_rounds (falling back to the built-in default).
  defp chat_max_tool_rounds do
+    chat_config(:max_tool_rounds, @chat_max_tool_rounds)
+  end
+
+  defp chat_await_timeout_ms do
+    per_request_timeout_ms = BDS.AI.HttpClient.request_timeout_ms()
+
+    per_request_timeout_ms * (chat_max_tool_rounds() + 1) +
+      chat_config(:await_timeout_margin_ms, @chat_await_timeout_margin_ms)
+  end
+
+  defp chat_config(key, default) do
    :bds
    |> Application.get_env(:chat, [])
-    |> Keyword.get(:max_tool_rounds, @chat_max_tool_rounds)
+    |> Keyword.get(key, default)
  end

  defp chat_system_prompt(project_id, tools) do
@@ -853,7 +865,7 @@ defmodule BDS.AI.Chat do
    :ok
  end

-  defp await_chat_task(task) do
+  defp await_chat_task(task, timeout_ms) do
    ref = task.ref

    receive do
@@ -879,6 +891,10 @@ defmodule BDS.AI.Chat do
          _other ->
            {:error, :cancelled}
        end
+    after
+      timeout_ms ->
+        _ = Task.shutdown(task, 100)
+        {:error, :chat_timeout}
    end
  end

--- a/lib/bds/ai/http_client.ex
+++ b/lib/bds/ai/http_client.ex
@@ -25,6 +25,14 @@ defmodule BDS.AI.HttpClient do
  @default_get_max_retries 2
  @default_retry_delay_ms 500

+  @spec request_timeout_ms() :: pos_integer()
+  def request_timeout_ms do
+    max(
+      config(:connect_timeout_ms, @default_connect_timeout_ms),
+      config(:receive_timeout_ms, @default_receive_timeout_ms)
+    )
+  end
+
  @spec get(String.t(), %{String.t() => String.t()}) ::
          {:ok, %{status: non_neg_integer(), headers: map(), body: binary()}} | {:error, term()}
  def get(url, headers) when is_binary(url) and is_map(headers) do
--- a/test/bds/ai_test.exs
+++ b/test/bds/ai_test.exs
@@ -279,6 +279,24 @@ defmodule BDS.AITest do
    end
  end

+  defmodule ShutdownAwareBlockingRuntime do
+    def generate(endpoint, request, opts) do
+      Process.flag(:trap_exit, true)
+
+      test_pid = Keyword.fetch!(opts, :test_pid)
+      send(test_pid, {:blocking_runtime_started, endpoint, request, self()})
+
+      receive do
+        {:EXIT, _from, :shutdown} ->
+          send(test_pid, :blocking_runtime_shutdown)
+          exit(:shutdown)
+      after
+        5_000 ->
+          {:ok, %{content: "too late", usage: %{input_tokens: 1, output_tokens: 1}}}
+      end
+    end
+  end
+
  # Always returns another tool call and never a final answer, so a chat would
  # loop forever if the round count were not bounded.
  defmodule LoopingToolRuntime do
@@ -1772,6 +1790,64 @@ defmodule BDS.AITest do
    assert Enum.map(messages, & &1.role) == [:user]
  end

+  @tag :chat_timeout
+  test "send_chat_message times out a stalled chat round and keeps persisted state consistent" do
+    original_chat_config = Application.get_env(:bds, :chat, [])
+    original_http_config = Application.get_env(:bds, BDS.AI.HttpClient, [])
+
+    Application.put_env(
+      :bds,
+      :chat,
+      original_chat_config
+      |> Keyword.put(:max_tool_rounds, 1)
+      |> Keyword.put(:await_timeout_margin_ms, 25)
+    )
+
+    Application.put_env(
+      :bds,
+      BDS.AI.HttpClient,
+      original_http_config
+      |> Keyword.put(:connect_timeout_ms, 50)
+      |> Keyword.put(:receive_timeout_ms, 50)
+    )
+
+    on_exit(fn ->
+      Application.put_env(:bds, :chat, original_chat_config)
+      Application.put_env(:bds, BDS.AI.HttpClient, original_http_config)
+    end)
+
+    assert {:ok, _endpoint} =
+             BDS.AI.put_endpoint(
+               :online,
+               %{
+                 url: "https://api.example.test/v1",
+                 api_key: "online-secret",
+                 model: "gpt-4o-mini"
+               },
+               secret_backend: FakeSecretBackend
+             )
+
+    assert {:ok, conversation} = BDS.AI.start_chat(%{model: "gpt-4o-mini"})
+
+    started_at = System.monotonic_time(:millisecond)
+
+    assert {:error, :chat_timeout} =
+             BDS.AI.send_chat_message(conversation.id, "Please wait forever",
+               runtime: ShutdownAwareBlockingRuntime,
+               test_pid: self(),
+               secret_backend: FakeSecretBackend
+             )
+
+    elapsed_ms = System.monotonic_time(:millisecond) - started_at
+
+    assert elapsed_ms < 1_000
+    assert_receive {:blocking_runtime_started, _endpoint, %{operation: :chat}, _pid}, 500
+    assert_receive :blocking_runtime_shutdown, 500
+
+    messages = BDS.AI.list_chat_messages(conversation.id)
+    assert Enum.map(messages, & &1.role) == [:user]
+  end
+
  test "get_surface_state and put_surface_state persist and restore surface UI state" do
    assert {:ok, conversation} = BDS.AI.start_chat(%{title: "Surface State", model: "gpt-4.1"})

--- a/test/bds/wxr_parser_test.exs
+++ b/test/bds/wxr_parser_test.exs
@@ -1,5 +1,5 @@
 defmodule BDS.WxrParserTest do
-  use ExUnit.Case, async: true
+  use ExUnit.Case, async: false

  alias BDS.WxrParser

@@ -102,6 +102,8 @@ defmodule BDS.WxrParserTest do
    </rss>
    """

+    _warmup = WxrParser.parse_xml(sample_wxr_xml())
+
    atom_count_before = :erlang.system_info(:atom_count)
    parsed = WxrParser.parse_xml(xml)
    atom_count_after = :erlang.system_info(:atom_count)