fix: fixed CSM-004
This commit is contained in:
24
CODESMELL.md
24
CODESMELL.md
@@ -72,18 +72,17 @@
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### CSM-004 — Blocking `init/1` + Missing `terminate/2` in Job Runner
|
### ~~CSM-004 — Blocking `init/1` + Missing `terminate/2` in Job Runner~~ ✅ FIXED
|
||||||
- **File:** `lib/bds/scripting/job_runner.ex:28-35`
|
- **Fixed:** 2026-05-08
|
||||||
- **What:**
|
- **What was done:**
|
||||||
1. Synchronous `GenServer.call` to `JobStore.attach_runner/2` inside `init/1` (Z. 30) blocks the supervisor if `JobStore` is slow.
|
- Moved `JobStore.attach_runner/2` from `init/1` to a new `handle_continue(:attach_and_start)` callback, so supervisor startup is no longer blocked by the synchronous call.
|
||||||
2. `Process.flag(:trap_exit, true)` (Z. 29) is set but there is **no** `terminate/2` callback and **no** `{:EXIT, ...}` handler in `handle_info`.
|
- Added `terminate/2` callback that calls `JobStore.detach_runner/2` (with `try/catch` for shutdown safety), centralizing cleanup that was previously scattered across individual exit paths.
|
||||||
- **Why it's bad:**
|
- Added `handle_info({:EXIT, _pid, _reason})` clause to handle trapped exit signals from linked processes.
|
||||||
- Supervisor child startup hangs if `JobStore` is slow or unavailable.
|
- Removed redundant inline `detach_runner` calls from `handle_call(:cancel)`, task result handler, and `:DOWN` handler — `terminate/2` now handles all detach cleanup.
|
||||||
- If the runner process is killed (not via `:cancel`), the PID leaks in `JobStore.runners`. The `detach_runner/2` call only happens in `handle_call(:cancel)` (Z. 71) and `handle_info({ref, result})` (Z. 95) and `handle_info({:DOWN, ...})` (Z. 117). A linked process crash with `trap_exit` would send `{:EXIT, pid, reason}` which has **no handler**, so the runner stops without detaching.
|
- Changed `restart: :temporary` since job runners are one-shot processes that should not auto-restart on failure.
|
||||||
- **Fix:**
|
- Added `@impl true` to all `handle_info` clauses.
|
||||||
- Move `attach_runner` to `handle_continue/2`.
|
- Fixed pre-existing bug in `JobStore.detach_runner` handler where `update_in/2` macro result was incorrectly double-wrapped, corrupting state.
|
||||||
- Either remove `trap_exit` (the task uses `async_nolink`, so links aren't the issue) OR add `terminate/2` that calls `JobStore.detach_runner/2` and add an `{:EXIT, ...}` clause in `handle_info`.
|
- Added test: start a runner, kill it externally (not via cancel), assert `JobStore` no longer contains the dead PID.
|
||||||
- **Test:** Start a runner, kill it (not via cancel); assert `JobStore` does not contain the dead PID.
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -400,6 +399,7 @@
|
|||||||
- [x] All critical items (CSM-001 to CSM-005) have been addressed or explicitly deferred with justification.
|
- [x] All critical items (CSM-001 to CSM-005) have been addressed or explicitly deferred with justification.
|
||||||
- CSM-001: Fixed. All `String.to_atom` on dynamic data replaced with `MapUtils.safe_atomize_key/keys` or `String.to_existing_atom`.
|
- CSM-001: Fixed. All `String.to_atom` on dynamic data replaced with `MapUtils.safe_atomize_key/keys` or `String.to_existing_atom`.
|
||||||
- CSM-002: Fixed. Search now pushes all filtering and pagination into SQL via Ecto queries and CTEs.
|
- CSM-002: Fixed. Search now pushes all filtering and pagination into SQL via Ecto queries and CTEs.
|
||||||
|
- CSM-004: Fixed. `attach_runner` moved to `handle_continue`, `terminate/2` added for cleanup, `restart: :temporary` set, JobStore `detach_runner` bug fixed.
|
||||||
- [ ] All high-severity items (CSM-006 to CSM-010) have been addressed.
|
- [ ] All high-severity items (CSM-006 to CSM-010) have been addressed.
|
||||||
- [x] CSM-001 fix covers ALL 6 affected files, not just `import_definitions.ex`.
|
- [x] CSM-001 fix covers ALL 6 affected files, not just `import_definitions.ex`.
|
||||||
- [x] CSM-003 fix covers ALL `Repo.delete!` call sites (posts, tags, scripts, media, projects, templates, translations).
|
- [x] CSM-003 fix covers ALL `Repo.delete!` call sites (posts, tags, scripts, media, projects, templates, translations).
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
defmodule BDS.Scripting.JobRunner do
|
defmodule BDS.Scripting.JobRunner do
|
||||||
@moduledoc false
|
@moduledoc false
|
||||||
|
|
||||||
use GenServer
|
use GenServer, restart: :temporary
|
||||||
|
|
||||||
def start_link(opts) do
|
def start_link(opts) do
|
||||||
GenServer.start_link(__MODULE__, opts)
|
GenServer.start_link(__MODULE__, opts)
|
||||||
@@ -27,8 +27,13 @@ defmodule BDS.Scripting.JobRunner do
|
|||||||
}
|
}
|
||||||
|
|
||||||
Process.flag(:trap_exit, true)
|
Process.flag(:trap_exit, true)
|
||||||
|
{:ok, state, {:continue, :attach_and_start}}
|
||||||
|
end
|
||||||
|
|
||||||
|
@impl true
|
||||||
|
def handle_continue(:attach_and_start, state) do
|
||||||
:ok = BDS.Scripting.JobStore.attach_runner(state.job_id, self())
|
:ok = BDS.Scripting.JobStore.attach_runner(state.job_id, self())
|
||||||
{:ok, state, {:continue, :start_job}}
|
{:noreply, state, {:continue, :start_job}}
|
||||||
end
|
end
|
||||||
|
|
||||||
@impl true
|
@impl true
|
||||||
@@ -68,7 +73,6 @@ defmodule BDS.Scripting.JobRunner do
|
|||||||
finished_at: DateTime.utc_now()
|
finished_at: DateTime.utc_now()
|
||||||
})
|
})
|
||||||
|
|
||||||
:ok = BDS.Scripting.JobStore.detach_runner(state.job_id)
|
|
||||||
{:stop, :normal, :ok, %{state | cancelled?: true}}
|
{:stop, :normal, :ok, %{state | cancelled?: true}}
|
||||||
end
|
end
|
||||||
|
|
||||||
@@ -78,6 +82,7 @@ defmodule BDS.Scripting.JobRunner do
|
|||||||
{:noreply, state}
|
{:noreply, state}
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@impl true
|
||||||
def handle_info({ref, result}, %{task_ref: ref} = state) do
|
def handle_info({ref, result}, %{task_ref: ref} = state) do
|
||||||
Process.demonitor(ref, [:flush])
|
Process.demonitor(ref, [:flush])
|
||||||
|
|
||||||
@@ -92,12 +97,12 @@ defmodule BDS.Scripting.JobRunner do
|
|||||||
end
|
end
|
||||||
|
|
||||||
:ok = BDS.Scripting.JobStore.update_job(state.job_id, attrs)
|
:ok = BDS.Scripting.JobStore.update_job(state.job_id, attrs)
|
||||||
:ok = BDS.Scripting.JobStore.detach_runner(state.job_id)
|
|
||||||
end
|
end
|
||||||
|
|
||||||
{:stop, :normal, %{state | completed?: true}}
|
{:stop, :normal, %{state | completed?: true}}
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@impl true
|
||||||
def handle_info({:DOWN, ref, :process, _pid, reason}, %{task_ref: ref} = state) do
|
def handle_info({:DOWN, ref, :process, _pid, reason}, %{task_ref: ref} = state) do
|
||||||
cond do
|
cond do
|
||||||
state.completed? or state.cancelled? ->
|
state.completed? or state.cancelled? ->
|
||||||
@@ -114,8 +119,21 @@ defmodule BDS.Scripting.JobRunner do
|
|||||||
finished_at: DateTime.utc_now()
|
finished_at: DateTime.utc_now()
|
||||||
})
|
})
|
||||||
|
|
||||||
:ok = BDS.Scripting.JobStore.detach_runner(state.job_id)
|
|
||||||
{:stop, :normal, state}
|
{:stop, :normal, state}
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
@impl true
|
||||||
|
def handle_info({:EXIT, _pid, _reason}, state) do
|
||||||
|
{:stop, :normal, state}
|
||||||
|
end
|
||||||
|
|
||||||
|
@impl true
|
||||||
|
def terminate(_reason, state) do
|
||||||
|
try do
|
||||||
|
BDS.Scripting.JobStore.detach_runner(state.job_id)
|
||||||
|
catch
|
||||||
|
:exit, _ -> :ok
|
||||||
|
end
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|||||||
@@ -65,8 +65,7 @@ defmodule BDS.Scripting.JobStore do
|
|||||||
end
|
end
|
||||||
|
|
||||||
def handle_call({:detach_runner, job_id}, _from, state) do
|
def handle_call({:detach_runner, job_id}, _from, state) do
|
||||||
next_state = update_in(state.runners, &Map.delete(&1, job_id))
|
{:reply, :ok, %{state | runners: Map.delete(state.runners, job_id)}}
|
||||||
{:reply, :ok, %{state | runners: next_state}}
|
|
||||||
end
|
end
|
||||||
|
|
||||||
def handle_call({:fetch_job, job_id}, _from, state) do
|
def handle_call({:fetch_job, job_id}, _from, state) do
|
||||||
|
|||||||
@@ -90,6 +90,26 @@ defmodule BDS.Scripting.JobTest do
|
|||||||
assert cancelled_job.finished_at != nil
|
assert cancelled_job.finished_at != nil
|
||||||
end
|
end
|
||||||
|
|
||||||
|
test "killing a runner detaches it from JobStore (CSM-004)" do
|
||||||
|
Application.put_env(:bds, :scripting,
|
||||||
|
runtime: BlockingRuntime,
|
||||||
|
timeout: 300_000,
|
||||||
|
max_reductions: 5_000_000,
|
||||||
|
job_timeout: :infinity,
|
||||||
|
job_max_reductions: :none
|
||||||
|
)
|
||||||
|
|
||||||
|
assert {:ok, job} = BDS.Scripting.start_job("irrelevant", "main")
|
||||||
|
_running_job = wait_for_job(job.id, &(&1.status == :running))
|
||||||
|
|
||||||
|
runner_pid = BDS.Scripting.JobStore.runner_for(job.id)
|
||||||
|
assert is_pid(runner_pid)
|
||||||
|
|
||||||
|
GenServer.stop(runner_pid, :shutdown)
|
||||||
|
Process.sleep(50)
|
||||||
|
assert BDS.Scripting.JobStore.runner_for(job.id) == nil
|
||||||
|
end
|
||||||
|
|
||||||
defp wait_for_job(job_id, predicate, attempts \\ 50)
|
defp wait_for_job(job_id, predicate, attempts \\ 50)
|
||||||
|
|
||||||
defp wait_for_job(job_id, predicate, attempts) when attempts > 0 do
|
defp wait_for_job(job_id, predicate, attempts) when attempts > 0 do
|
||||||
|
|||||||
Reference in New Issue
Block a user