fix: fixed CSM-004

This commit is contained in:
2026-05-08 19:52:59 +02:00
parent 723b8c6433
commit 9944b70ab1
4 changed files with 56 additions and 19 deletions

View File

@@ -72,18 +72,17 @@
--- ---
### CSM-004 — Blocking `init/1` + Missing `terminate/2` in Job Runner ### ~~CSM-004 — Blocking `init/1` + Missing `terminate/2` in Job Runner~~ ✅ FIXED
- **File:** `lib/bds/scripting/job_runner.ex:28-35` - **Fixed:** 2026-05-08
- **What:** - **What was done:**
1. Synchronous `GenServer.call` to `JobStore.attach_runner/2` inside `init/1` (Z. 30) blocks the supervisor if `JobStore` is slow. - Moved `JobStore.attach_runner/2` from `init/1` to a new `handle_continue(:attach_and_start)` callback, so supervisor startup is no longer blocked by the synchronous call.
2. `Process.flag(:trap_exit, true)` (Z. 29) is set but there is **no** `terminate/2` callback and **no** `{:EXIT, ...}` handler in `handle_info`. - Added `terminate/2` callback that calls `JobStore.detach_runner/2` (with `try/catch` for shutdown safety), centralizing cleanup that was previously scattered across individual exit paths.
- **Why it's bad:** - Added `handle_info({:EXIT, _pid, _reason})` clause to handle trapped exit signals from linked processes.
- Supervisor child startup hangs if `JobStore` is slow or unavailable. - Removed redundant inline `detach_runner` calls from `handle_call(:cancel)`, task result handler, and `:DOWN` handler — `terminate/2` now handles all detach cleanup.
- If the runner process is killed (not via `:cancel`), the PID leaks in `JobStore.runners`. The `detach_runner/2` call only happens in `handle_call(:cancel)` (Z. 71) and `handle_info({ref, result})` (Z. 95) and `handle_info({:DOWN, ...})` (Z. 117). A linked process crash with `trap_exit` would send `{:EXIT, pid, reason}` which has **no handler**, so the runner stops without detaching. - Changed `restart: :temporary` since job runners are one-shot processes that should not auto-restart on failure.
- **Fix:** - Added `@impl true` to all `handle_info` clauses.
- Move `attach_runner` to `handle_continue/2`. - Fixed pre-existing bug in `JobStore.detach_runner` handler where `update_in/2` macro result was incorrectly double-wrapped, corrupting state.
- Either remove `trap_exit` (the task uses `async_nolink`, so links aren't the issue) OR add `terminate/2` that calls `JobStore.detach_runner/2` and add an `{:EXIT, ...}` clause in `handle_info`. - Added test: start a runner, kill it externally (not via cancel), assert `JobStore` no longer contains the dead PID.
- **Test:** Start a runner, kill it (not via cancel); assert `JobStore` does not contain the dead PID.
--- ---
@@ -400,6 +399,7 @@
- [x] All critical items (CSM-001 to CSM-005) have been addressed or explicitly deferred with justification. - [x] All critical items (CSM-001 to CSM-005) have been addressed or explicitly deferred with justification.
- CSM-001: Fixed. All `String.to_atom` on dynamic data replaced with `MapUtils.safe_atomize_key/keys` or `String.to_existing_atom`. - CSM-001: Fixed. All `String.to_atom` on dynamic data replaced with `MapUtils.safe_atomize_key/keys` or `String.to_existing_atom`.
- CSM-002: Fixed. Search now pushes all filtering and pagination into SQL via Ecto queries and CTEs. - CSM-002: Fixed. Search now pushes all filtering and pagination into SQL via Ecto queries and CTEs.
- CSM-004: Fixed. `attach_runner` moved to `handle_continue`, `terminate/2` added for cleanup, `restart: :temporary` set, JobStore `detach_runner` bug fixed.
- [ ] All high-severity items (CSM-006 to CSM-010) have been addressed. - [ ] All high-severity items (CSM-006 to CSM-010) have been addressed.
- [x] CSM-001 fix covers ALL 6 affected files, not just `import_definitions.ex`. - [x] CSM-001 fix covers ALL 6 affected files, not just `import_definitions.ex`.
- [x] CSM-003 fix covers ALL `Repo.delete!` call sites (posts, tags, scripts, media, projects, templates, translations). - [x] CSM-003 fix covers ALL `Repo.delete!` call sites (posts, tags, scripts, media, projects, templates, translations).

View File

@@ -1,7 +1,7 @@
defmodule BDS.Scripting.JobRunner do defmodule BDS.Scripting.JobRunner do
@moduledoc false @moduledoc false
use GenServer use GenServer, restart: :temporary
def start_link(opts) do def start_link(opts) do
GenServer.start_link(__MODULE__, opts) GenServer.start_link(__MODULE__, opts)
@@ -27,8 +27,13 @@ defmodule BDS.Scripting.JobRunner do
} }
Process.flag(:trap_exit, true) Process.flag(:trap_exit, true)
{:ok, state, {:continue, :attach_and_start}}
end
@impl true
def handle_continue(:attach_and_start, state) do
:ok = BDS.Scripting.JobStore.attach_runner(state.job_id, self()) :ok = BDS.Scripting.JobStore.attach_runner(state.job_id, self())
{:ok, state, {:continue, :start_job}} {:noreply, state, {:continue, :start_job}}
end end
@impl true @impl true
@@ -68,7 +73,6 @@ defmodule BDS.Scripting.JobRunner do
finished_at: DateTime.utc_now() finished_at: DateTime.utc_now()
}) })
:ok = BDS.Scripting.JobStore.detach_runner(state.job_id)
{:stop, :normal, :ok, %{state | cancelled?: true}} {:stop, :normal, :ok, %{state | cancelled?: true}}
end end
@@ -78,6 +82,7 @@ defmodule BDS.Scripting.JobRunner do
{:noreply, state} {:noreply, state}
end end
@impl true
def handle_info({ref, result}, %{task_ref: ref} = state) do def handle_info({ref, result}, %{task_ref: ref} = state) do
Process.demonitor(ref, [:flush]) Process.demonitor(ref, [:flush])
@@ -92,12 +97,12 @@ defmodule BDS.Scripting.JobRunner do
end end
:ok = BDS.Scripting.JobStore.update_job(state.job_id, attrs) :ok = BDS.Scripting.JobStore.update_job(state.job_id, attrs)
:ok = BDS.Scripting.JobStore.detach_runner(state.job_id)
end end
{:stop, :normal, %{state | completed?: true}} {:stop, :normal, %{state | completed?: true}}
end end
@impl true
def handle_info({:DOWN, ref, :process, _pid, reason}, %{task_ref: ref} = state) do def handle_info({:DOWN, ref, :process, _pid, reason}, %{task_ref: ref} = state) do
cond do cond do
state.completed? or state.cancelled? -> state.completed? or state.cancelled? ->
@@ -114,8 +119,21 @@ defmodule BDS.Scripting.JobRunner do
finished_at: DateTime.utc_now() finished_at: DateTime.utc_now()
}) })
:ok = BDS.Scripting.JobStore.detach_runner(state.job_id)
{:stop, :normal, state} {:stop, :normal, state}
end end
end end
@impl true
def handle_info({:EXIT, _pid, _reason}, state) do
{:stop, :normal, state}
end
@impl true
def terminate(_reason, state) do
try do
BDS.Scripting.JobStore.detach_runner(state.job_id)
catch
:exit, _ -> :ok
end
end
end end

View File

@@ -65,8 +65,7 @@ defmodule BDS.Scripting.JobStore do
end end
def handle_call({:detach_runner, job_id}, _from, state) do def handle_call({:detach_runner, job_id}, _from, state) do
next_state = update_in(state.runners, &Map.delete(&1, job_id)) {:reply, :ok, %{state | runners: Map.delete(state.runners, job_id)}}
{:reply, :ok, %{state | runners: next_state}}
end end
def handle_call({:fetch_job, job_id}, _from, state) do def handle_call({:fetch_job, job_id}, _from, state) do

View File

@@ -90,6 +90,26 @@ defmodule BDS.Scripting.JobTest do
assert cancelled_job.finished_at != nil assert cancelled_job.finished_at != nil
end end
test "killing a runner detaches it from JobStore (CSM-004)" do
Application.put_env(:bds, :scripting,
runtime: BlockingRuntime,
timeout: 300_000,
max_reductions: 5_000_000,
job_timeout: :infinity,
job_max_reductions: :none
)
assert {:ok, job} = BDS.Scripting.start_job("irrelevant", "main")
_running_job = wait_for_job(job.id, &(&1.status == :running))
runner_pid = BDS.Scripting.JobStore.runner_for(job.id)
assert is_pid(runner_pid)
GenServer.stop(runner_pid, :shutdown)
Process.sleep(50)
assert BDS.Scripting.JobStore.runner_for(job.id) == nil
end
defp wait_for_job(job_id, predicate, attempts \\ 50) defp wait_for_job(job_id, predicate, attempts \\ 50)
defp wait_for_job(job_id, predicate, attempts) when attempts > 0 do defp wait_for_job(job_id, predicate, attempts) when attempts > 0 do