diff --git a/CODESMELL.md b/CODESMELL.md index 4bd423f..53a456d 100644 --- a/CODESMELL.md +++ b/CODESMELL.md @@ -72,18 +72,17 @@ --- -### CSM-004 — Blocking `init/1` + Missing `terminate/2` in Job Runner -- **File:** `lib/bds/scripting/job_runner.ex:28-35` -- **What:** - 1. Synchronous `GenServer.call` to `JobStore.attach_runner/2` inside `init/1` (Z. 30) blocks the supervisor if `JobStore` is slow. - 2. `Process.flag(:trap_exit, true)` (Z. 29) is set but there is **no** `terminate/2` callback and **no** `{:EXIT, ...}` handler in `handle_info`. -- **Why it's bad:** - - Supervisor child startup hangs if `JobStore` is slow or unavailable. - - If the runner process is killed (not via `:cancel`), the PID leaks in `JobStore.runners`. The `detach_runner/2` call only happens in `handle_call(:cancel)` (Z. 71) and `handle_info({ref, result})` (Z. 95) and `handle_info({:DOWN, ...})` (Z. 117). A linked process crash with `trap_exit` would send `{:EXIT, pid, reason}` which has **no handler**, so the runner stops without detaching. -- **Fix:** - - Move `attach_runner` to `handle_continue/2`. - - Either remove `trap_exit` (the task uses `async_nolink`, so links aren't the issue) OR add `terminate/2` that calls `JobStore.detach_runner/2` and add an `{:EXIT, ...}` clause in `handle_info`. -- **Test:** Start a runner, kill it (not via cancel); assert `JobStore` does not contain the dead PID. +### ~~CSM-004 — Blocking `init/1` + Missing `terminate/2` in Job Runner~~ ✅ FIXED +- **Fixed:** 2026-05-08 +- **What was done:** + - Moved `JobStore.attach_runner/2` from `init/1` to a new `handle_continue(:attach_and_start)` callback, so supervisor startup is no longer blocked by the synchronous call. + - Added `terminate/2` callback that calls `JobStore.detach_runner/2` (with `try/catch` for shutdown safety), centralizing cleanup that was previously scattered across individual exit paths. + - Added `handle_info({:EXIT, _pid, _reason})` clause to handle trapped exit signals from linked processes. + - Removed redundant inline `detach_runner` calls from `handle_call(:cancel)`, task result handler, and `:DOWN` handler — `terminate/2` now handles all detach cleanup. + - Changed `restart: :temporary` since job runners are one-shot processes that should not auto-restart on failure. + - Added `@impl true` to all `handle_info` clauses. + - Fixed pre-existing bug in `JobStore.detach_runner` handler where `update_in/2` macro result was incorrectly double-wrapped, corrupting state. + - Added test: start a runner, kill it externally (not via cancel), assert `JobStore` no longer contains the dead PID. --- @@ -400,6 +399,7 @@ - [x] All critical items (CSM-001 to CSM-005) have been addressed or explicitly deferred with justification. - CSM-001: Fixed. All `String.to_atom` on dynamic data replaced with `MapUtils.safe_atomize_key/keys` or `String.to_existing_atom`. - CSM-002: Fixed. Search now pushes all filtering and pagination into SQL via Ecto queries and CTEs. + - CSM-004: Fixed. `attach_runner` moved to `handle_continue`, `terminate/2` added for cleanup, `restart: :temporary` set, JobStore `detach_runner` bug fixed. - [ ] All high-severity items (CSM-006 to CSM-010) have been addressed. - [x] CSM-001 fix covers ALL 6 affected files, not just `import_definitions.ex`. - [x] CSM-003 fix covers ALL `Repo.delete!` call sites (posts, tags, scripts, media, projects, templates, translations). diff --git a/lib/bds/scripting/job_runner.ex b/lib/bds/scripting/job_runner.ex index 1d67c69..b28c061 100644 --- a/lib/bds/scripting/job_runner.ex +++ b/lib/bds/scripting/job_runner.ex @@ -1,7 +1,7 @@ defmodule BDS.Scripting.JobRunner do @moduledoc false - use GenServer + use GenServer, restart: :temporary def start_link(opts) do GenServer.start_link(__MODULE__, opts) @@ -27,8 +27,13 @@ defmodule BDS.Scripting.JobRunner do } Process.flag(:trap_exit, true) + {:ok, state, {:continue, :attach_and_start}} + end + + @impl true + def handle_continue(:attach_and_start, state) do :ok = BDS.Scripting.JobStore.attach_runner(state.job_id, self()) - {:ok, state, {:continue, :start_job}} + {:noreply, state, {:continue, :start_job}} end @impl true @@ -68,7 +73,6 @@ defmodule BDS.Scripting.JobRunner do finished_at: DateTime.utc_now() }) - :ok = BDS.Scripting.JobStore.detach_runner(state.job_id) {:stop, :normal, :ok, %{state | cancelled?: true}} end @@ -78,6 +82,7 @@ defmodule BDS.Scripting.JobRunner do {:noreply, state} end + @impl true def handle_info({ref, result}, %{task_ref: ref} = state) do Process.demonitor(ref, [:flush]) @@ -92,12 +97,12 @@ defmodule BDS.Scripting.JobRunner do end :ok = BDS.Scripting.JobStore.update_job(state.job_id, attrs) - :ok = BDS.Scripting.JobStore.detach_runner(state.job_id) end {:stop, :normal, %{state | completed?: true}} end + @impl true def handle_info({:DOWN, ref, :process, _pid, reason}, %{task_ref: ref} = state) do cond do state.completed? or state.cancelled? -> @@ -114,8 +119,21 @@ defmodule BDS.Scripting.JobRunner do finished_at: DateTime.utc_now() }) - :ok = BDS.Scripting.JobStore.detach_runner(state.job_id) {:stop, :normal, state} end end + + @impl true + def handle_info({:EXIT, _pid, _reason}, state) do + {:stop, :normal, state} + end + + @impl true + def terminate(_reason, state) do + try do + BDS.Scripting.JobStore.detach_runner(state.job_id) + catch + :exit, _ -> :ok + end + end end diff --git a/lib/bds/scripting/job_store.ex b/lib/bds/scripting/job_store.ex index c83ea39..bd0a38a 100644 --- a/lib/bds/scripting/job_store.ex +++ b/lib/bds/scripting/job_store.ex @@ -65,8 +65,7 @@ defmodule BDS.Scripting.JobStore do end def handle_call({:detach_runner, job_id}, _from, state) do - next_state = update_in(state.runners, &Map.delete(&1, job_id)) - {:reply, :ok, %{state | runners: next_state}} + {:reply, :ok, %{state | runners: Map.delete(state.runners, job_id)}} end def handle_call({:fetch_job, job_id}, _from, state) do diff --git a/test/bds/scripting/job_test.exs b/test/bds/scripting/job_test.exs index 183b6f7..aa97a9e 100644 --- a/test/bds/scripting/job_test.exs +++ b/test/bds/scripting/job_test.exs @@ -90,6 +90,26 @@ defmodule BDS.Scripting.JobTest do assert cancelled_job.finished_at != nil end + test "killing a runner detaches it from JobStore (CSM-004)" do + Application.put_env(:bds, :scripting, + runtime: BlockingRuntime, + timeout: 300_000, + max_reductions: 5_000_000, + job_timeout: :infinity, + job_max_reductions: :none + ) + + assert {:ok, job} = BDS.Scripting.start_job("irrelevant", "main") + _running_job = wait_for_job(job.id, &(&1.status == :running)) + + runner_pid = BDS.Scripting.JobStore.runner_for(job.id) + assert is_pid(runner_pid) + + GenServer.stop(runner_pid, :shutdown) + Process.sleep(50) + assert BDS.Scripting.JobStore.runner_for(job.id) == nil + end + defp wait_for_job(job_id, predicate, attempts \\ 50) defp wait_for_job(job_id, predicate, attempts) when attempts > 0 do