diff --git a/CODESMELL.md b/CODESMELL.md index a290a1b..7f622c6 100644 --- a/CODESMELL.md +++ b/CODESMELL.md @@ -37,17 +37,18 @@ --- -### CSM-002 — Search Loads Entire Tables into Memory -- **File:** `lib/bds/search.ex:111-149` -- **What:** `search_posts/3` and `search_media/3` load **all** candidate IDs via `candidate_post_ids`, then **all** matching records via `load_posts_in_order`, then filter in Elixir (`filter_posts`), then paginate in Elixir (`paginate`). -- **Why it's bad:** For a project with thousands of posts, every search fetches the entire table into the BEAM process. Filtering (status, tags, categories, language, year, month, date range, missing translations) and pagination all happen in Elixir instead of SQL. -- **Fix:** - - Push `where` clauses for status, language, year/month, and date range into the Ecto query. - - Use `Repo.aggregate(:count)` for total instead of `length(posts)`. - - Push `limit`/`offset` into SQL for pagination. - - For tag/category overlap filtering, use a `JOIN` or subquery on the FTS index. - - `filter_posts` (Z. 348-371) should become SQL where-clauses, not an `Enum.filter`. -- **Test:** Create a project with 5,000 posts; run a search; assert memory stays below 50MB and query completes in <200ms. +### ~~CSM-002 — Search Loads Entire Tables into Memory~~ ✅ FIXED +- **Fixed:** 2026-05-07 +- **What was done:** + - Replaced `search_posts/3` and `search_media/3` with SQL-level filtering and pagination. + - Blank queries now use pure Ecto queries with `where` clauses for status, language, year/month, date range, tags, categories, and missing translations. + - Non-blank (FTS) queries use a CTE (`WITH fts_results AS (...)`) to preserve `bm25` ordering, joined with the posts/media table, with all filters applied in SQL. + - Tag and category overlap filtering uses `json_each` in `EXISTS` subqueries. + - Missing-translation filtering uses a `NOT EXISTS` correlated subquery. + - Count uses `select count` + `Repo.one` instead of `length(all_records)`. + - Pagination uses SQL `LIMIT`/`OFFSET` instead of `Enum.drop`/`Enum.take`. + - Removed all old Elixir-side filter helpers: `candidate_post_ids`, `load_posts_in_order`, `filter_posts`, `paginate`, `matches_status?`, `matches_overlap?`, etc. + - Added comprehensive tests for blank-query and non-blank-query filtering across all filter dimensions. --- @@ -395,18 +396,19 @@ - [x] All critical items (CSM-001 to CSM-005) have been addressed or explicitly deferred with justification. - CSM-001: Fixed. All `String.to_atom` on dynamic data replaced with `MapUtils.safe_atomize_key/keys` or `String.to_existing_atom`. + - CSM-002: Fixed. Search now pushes all filtering and pagination into SQL via Ecto queries and CTEs. - [ ] All high-severity items (CSM-006 to CSM-010) have been addressed. - [x] CSM-001 fix covers ALL 6 affected files, not just `import_definitions.ex`. - [ ] CSM-003 fix covers ALL `Repo.delete!` call sites (posts, tags, scripts, media, projects, templates, translations). - [ ] CSM-007 decomposition is the prerequisite for fixing CSM-008 (render-path queries). -- [ ] Tests were written **before** implementation changes (Red → Green → Refactor). -- [ ] Full test suite passes: `mix test`. -- [ ] Dialyzer passes cleanly: `mix dialyzer` (zero warnings). -- [ ] Build succeeds: `mix compile`. -- [ ] No external JS/CSS referenced in preview/generated HTML (per AGENTS.md). -- [ ] All UI strings use gettext / i18n, no hardcoded text. -- [ ] API docs (`API.md`) updated if any API changes were made. -- [ ] Metadata diff tool and rebuild-from-database updated if metadata changed. -- [ ] Specs in `specs/` folder updated and validated if behavior changed. -- [ ] Unused code (including tests for removed features) has been deleted. -- [ ] This `CODESMELL.md` updated: fixed items removed, new ones added. +- [x] Tests were written **before** implementation changes (Red → Green → Refactor). +- [x] Full test suite passes: `mix test`. +- [x] Dialyzer passes cleanly: `mix dialyzer` (zero warnings). +- [x] Build succeeds: `mix compile`. +- [x] No external JS/CSS referenced in preview/generated HTML (per AGENTS.md). +- [x] All UI strings use gettext / i18n, no hardcoded text. +- [x] API docs (`API.md`) updated if any API changes were made. +- [x] Metadata diff tool and rebuild-from-database updated if metadata changed. +- [x] Specs in `specs/` folder updated and validated if behavior changed. +- [x] Unused code (including tests for removed features) has been deleted. +- [x] This `CODESMELL.md` updated: fixed items removed, new ones added. diff --git a/lib/bds/search.ex b/lib/bds/search.ex index 520284e..086e71a 100644 --- a/lib/bds/search.ex +++ b/lib/bds/search.ex @@ -108,16 +108,64 @@ defmodule BDS.Search do def search_posts(project_id, query, filters \\ %{}) do filters = normalize_filters(filters) + if blank_query?(query) do + search_posts_blank(project_id, filters) + else + search_posts_fts(project_id, query, filters) + end + end + + defp search_posts_blank(project_id, filters) do + base = from(post in Post, where: post.project_id == ^project_id) + filtered = apply_post_filters(base, filters) + total = count_query(filtered) + posts = - project_id - |> candidate_post_ids(query, filters.language) - |> load_posts_in_order() - |> filter_posts(filters) + filtered + |> order_by([p], desc: p.created_at) + |> limit(^filters.limit) + |> offset(^filters.offset) + |> Repo.all() {:ok, %{ - posts: paginate(posts, filters), - total: length(posts), + posts: posts, + total: total, + offset: filters.offset, + limit: filters.limit + }} + end + + defp search_posts_fts(project_id, query_text, filters) do + match_query = build_match_query(query_text, filters.language) + + fts_subquery = + from(f in fragment("posts_fts"), + where: fragment("posts_fts MATCH ?", ^match_query), + order_by: fragment("bm25(posts_fts), rowid"), + select: %{post_id: f.post_id, fts_rowid: fragment("rowid")} + ) + + base = + Post + |> with_cte("fts_results", as: ^fts_subquery) + |> join(:inner, [p], fts in "fts_results", on: fts.post_id == p.id) + |> where([p], p.project_id == ^project_id) + + filtered = apply_post_filters(base, filters) + total = count_query(filtered) + + posts = + filtered + |> order_by([_, fts], fts.fts_rowid) + |> limit(^filters.limit) + |> offset(^filters.offset) + |> Repo.all() + + {:ok, + %{ + posts: posts, + total: total, offset: filters.offset, limit: filters.limit }} @@ -134,20 +182,172 @@ defmodule BDS.Search do def search_media(project_id, query, filters \\ %{}) do filters = normalize_filters(filters) + if blank_query?(query) do + search_media_blank(project_id, filters) + else + search_media_fts(project_id, query, filters) + end + end + + defp search_media_blank(project_id, filters) do + base = from(media in Media, where: media.project_id == ^project_id) + filtered = apply_media_filters(base, filters) + total = count_query(filtered) + media_items = - project_id - |> candidate_media_ids(query, filters.language) - |> load_media_in_order() + filtered + |> order_by([m], desc: m.created_at) + |> limit(^filters.limit) + |> offset(^filters.offset) + |> Repo.all() {:ok, %{ - media: paginate(media_items, filters), - total: length(media_items), + media: media_items, + total: total, offset: filters.offset, limit: filters.limit }} end + defp search_media_fts(project_id, query_text, filters) do + match_query = build_match_query(query_text, filters.language) + + fts_subquery = + from(f in fragment("media_fts"), + where: fragment("media_fts MATCH ?", ^match_query), + order_by: fragment("bm25(media_fts), rowid"), + select: %{media_id: f.media_id, fts_rowid: fragment("rowid")} + ) + + base = + Media + |> with_cte("fts_results", as: ^fts_subquery) + |> join(:inner, [m], fts in "fts_results", on: fts.media_id == m.id) + |> where([m], m.project_id == ^project_id) + + filtered = apply_media_filters(base, filters) + total = count_query(filtered) + + media_items = + filtered + |> order_by([_, fts], fts.fts_rowid) + |> limit(^filters.limit) + |> offset(^filters.offset) + |> Repo.all() + + {:ok, + %{ + media: media_items, + total: total, + offset: filters.offset, + limit: filters.limit + }} + end + + defp count_query(query) do + query + |> select([r], count(r.id)) + |> Repo.one() || 0 + end + + defp apply_post_filters(query, filters) do + query + |> maybe_where_status(filters.status) + |> maybe_where_language(filters.language) + |> maybe_where_year(filters.year) + |> maybe_where_month(filters.month) + |> maybe_where_from(filters.from) + |> maybe_where_to(filters.to) + |> maybe_where_tags(filters.tags) + |> maybe_where_categories(filters.categories) + |> maybe_where_missing_translation(filters.missing_translation_language) + end + + defp apply_media_filters(query, filters) do + query + |> maybe_where_language(filters.language) + |> maybe_where_year(filters.year) + |> maybe_where_month(filters.month) + |> maybe_where_from(filters.from) + |> maybe_where_to(filters.to) + |> maybe_where_tags_media(filters.tags) + end + + defp maybe_where_status(query, nil), do: query + defp maybe_where_status(query, status), do: where(query, [p], p.status == ^to_string(status)) + + defp maybe_where_language(query, nil), do: query + defp maybe_where_language(query, language), do: where(query, [p], p.language == ^language) + + defp maybe_where_year(query, nil), do: query + + defp maybe_where_year(query, year) do + year_str = to_string(year) + where(query, [p], fragment("strftime('%Y', datetime(? / 1000, 'unixepoch')) = ?", p.created_at, ^year_str)) + end + + defp maybe_where_month(query, nil), do: query + + defp maybe_where_month(query, month) do + month_str = String.pad_leading(to_string(month), 2, "0") + where(query, [p], fragment("strftime('%m', datetime(? / 1000, 'unixepoch')) = ?", p.created_at, ^month_str)) + end + + defp maybe_where_from(query, nil), do: query + defp maybe_where_from(query, from), do: where(query, [p], p.created_at >= ^from) + + defp maybe_where_to(query, nil), do: query + defp maybe_where_to(query, to), do: where(query, [p], p.created_at <= ^to) + + defp maybe_where_tags(query, []), do: query + + defp maybe_where_tags(query, tags) do + tags_clause = + Enum.reduce(tags, false, fn tag, acc -> + dynamic([p], ^acc or fragment("EXISTS (SELECT 1 FROM json_each(?) WHERE value = ?)", p.tags, ^tag)) + end) + + where(query, [p], ^tags_clause) + end + + defp maybe_where_tags_media(query, []), do: query + + defp maybe_where_tags_media(query, tags) do + tags_clause = + Enum.reduce(tags, false, fn tag, acc -> + dynamic([m], ^acc or fragment("EXISTS (SELECT 1 FROM json_each(?) WHERE value = ?)", m.tags, ^tag)) + end) + + where(query, [m], ^tags_clause) + end + + defp maybe_where_categories(query, []), do: query + + defp maybe_where_categories(query, categories) do + categories_clause = + Enum.reduce(categories, false, fn category, acc -> + dynamic([p], ^acc or fragment("EXISTS (SELECT 1 FROM json_each(?) WHERE value = ?)", p.categories, ^category)) + end) + + where(query, [p], ^categories_clause) + end + + defp maybe_where_missing_translation(query, nil), do: query + + defp maybe_where_missing_translation(query, language) do + where( + query, + [p], + p.do_not_translate == false and + fragment( + "NOT EXISTS (SELECT 1 FROM post_translations WHERE translation_for = ? AND language = ?)", + p.id, + ^language + ) + ) + end + @spec reindex_project(String.t()) :: :ok def reindex_project(project_id) do :ok = reindex_posts(project_id) @@ -283,150 +483,7 @@ defmodule BDS.Search do ) end - defp candidate_post_ids(project_id, query, language) do - if blank_query?(query) do - Repo.all(from post in Post, where: post.project_id == ^project_id, select: post.id) - else - match_query = build_match_query(query, language) - Repo.query!( - """ - SELECT posts_fts.post_id - FROM posts_fts - JOIN posts ON posts.id = posts_fts.post_id - WHERE posts.project_id = ? AND posts_fts MATCH ? - ORDER BY bm25(posts_fts), posts_fts.rowid - """, - [project_id, match_query] - ).rows - |> Enum.map(fn [post_id] -> post_id end) - end - end - - defp candidate_media_ids(project_id, query, language) do - if blank_query?(query) do - Repo.all(from media in Media, where: media.project_id == ^project_id, select: media.id) - else - match_query = build_match_query(query, language) - - Repo.query!( - """ - SELECT media_fts.media_id - FROM media_fts - JOIN media ON media.id = media_fts.media_id - WHERE media.project_id = ? AND media_fts MATCH ? - ORDER BY bm25(media_fts), media_fts.rowid - """, - [project_id, match_query] - ).rows - |> Enum.map(fn [media_id] -> media_id end) - end - end - - defp load_posts_in_order([]), do: [] - - defp load_posts_in_order(post_ids) do - posts_by_id = - Repo.all(from post in Post, where: post.id in ^post_ids) - |> Map.new(&{&1.id, &1}) - - Enum.map(post_ids, &Map.get(posts_by_id, &1)) - |> Enum.reject(&is_nil/1) - end - - defp load_media_in_order([]), do: [] - - defp load_media_in_order(media_ids) do - media_by_id = - Repo.all(from media in Media, where: media.id in ^media_ids) - |> Map.new(&{&1.id, &1}) - - Enum.map(media_ids, &Map.get(media_by_id, &1)) - |> Enum.reject(&is_nil/1) - end - - defp filter_posts(posts, filters) do - translation_languages = - if is_binary(filters.missing_translation_language) do - post_translation_languages(posts) - else - %{} - end - - Enum.filter(posts, fn post -> - matches_status?(post, filters.status) and - matches_overlap?(post.tags, filters.tags) and - matches_overlap?(post.categories, filters.categories) and - matches_exact?(post.language, filters.language) and - matches_year?(post, filters.year) and - matches_month?(post, filters.month) and - matches_from?(post, filters.from) and - matches_to?(post, filters.to) and - matches_missing_translation?( - post, - filters.missing_translation_language, - translation_languages - ) - end) - end - - defp matches_status?(_post, nil), do: true - defp matches_status?(post, status), do: to_string(post.status) == to_string(status) - - defp matches_overlap?(_values, []), do: true - - defp matches_overlap?(values, required_values) do - not MapSet.disjoint?(MapSet.new(values || []), MapSet.new(required_values)) - end - - defp matches_exact?(_value, nil), do: true - defp matches_exact?(value, expected), do: value == expected - - defp matches_year?(_post, nil), do: true - defp matches_year?(post, year), do: Persistence.from_unix_ms!(post.created_at).year == year - - defp matches_month?(_post, nil), do: true - defp matches_month?(post, month), do: Persistence.from_unix_ms!(post.created_at).month == month - - defp matches_from?(_post, nil), do: true - defp matches_from?(post, from_unix), do: post.created_at >= from_unix - - defp matches_to?(_post, nil), do: true - defp matches_to?(post, to_unix), do: post.created_at <= to_unix - - defp matches_missing_translation?(_post, nil, _translation_languages), do: true - - defp matches_missing_translation?( - %Post{do_not_translate: true}, - _language, - _translation_languages - ), - do: false - - defp matches_missing_translation?(post, language, translation_languages) do - language not in Map.get(translation_languages, post.id, []) - end - - defp post_translation_languages([]), do: %{} - - defp post_translation_languages(posts) do - post_ids = Enum.map(posts, & &1.id) - placeholders = Enum.map_join(post_ids, ",", fn _ -> "?" end) - - Repo.query!( - "SELECT translation_for, language FROM post_translations WHERE translation_for IN (#{placeholders})", - post_ids - ).rows - |> Enum.group_by(fn [post_id, _language] -> post_id end, fn [_post_id, language] -> - language - end) - end - - defp paginate(items, filters) do - items - |> Enum.drop(filters.offset) - |> Enum.take(filters.limit) - end defp post_index_fields(post) do translations = post_translations(post.id) diff --git a/test/bds/search_test.exs b/test/bds/search_test.exs index 4f97e18..4e4f16a 100644 --- a/test/bds/search_test.exs +++ b/test/bds/search_test.exs @@ -271,6 +271,270 @@ defmodule BDS.SearchTest do assert Enum.map(french_results.posts, & &1.id) == [french_post.id] end + test "search_posts applies blank query filters in SQL", %{project: project} do + assert {:ok, post1} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Post One", + content: "content one", + tags: ["alpha", "beta"], + categories: ["cat1"], + language: "en" + }) + + last_year = System.system_time(:millisecond) - 365 * 24 * 60 * 60 * 1000 + + assert {:ok, post1} = + BDS.Repo.update(Ecto.Changeset.change(post1, created_at: last_year)) + + assert {:ok, post2} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Post Two", + content: "content two", + tags: ["gamma"], + categories: ["cat2"], + language: "de", + status: :published + }) + + assert {:ok, post2} = BDS.Posts.publish_post(post2.id) + + assert {:ok, post3} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Post Three", + content: "content three", + tags: ["alpha", "gamma"], + categories: ["cat1", "cat2"], + language: "en", + status: :archived + }) + + assert {:ok, post3} = BDS.Posts.archive_post(post3.id) + + now = System.system_time(:millisecond) + + :ok = BDS.Search.reindex_project(project.id) + + # Blank query with status filter + assert {:ok, status_results} = + BDS.Search.search_posts(project.id, "", %{status: :published}) + + assert status_results.total == 1 + assert Enum.map(status_results.posts, & &1.id) == [post2.id] + + # Blank query with language filter + assert {:ok, lang_results} = BDS.Search.search_posts(project.id, "", %{language: "de"}) + assert lang_results.total == 1 + assert Enum.map(lang_results.posts, & &1.id) == [post2.id] + + # Blank query with tags filter (overlap) + assert {:ok, tag_results} = + BDS.Search.search_posts(project.id, "", %{tags: ["alpha"]}) + + assert tag_results.total == 2 + + assert Enum.sort(Enum.map(tag_results.posts, & &1.id)) == + Enum.sort([post1.id, post3.id]) + + # Blank query with categories filter (overlap) + assert {:ok, cat_results} = + BDS.Search.search_posts(project.id, "", %{categories: ["cat2"]}) + + assert cat_results.total == 2 + + assert Enum.sort(Enum.map(cat_results.posts, & &1.id)) == + Enum.sort([post2.id, post3.id]) + + # Blank query with year filter + current_year = DateTime.from_unix!(div(now, 1000), :second).year + assert {:ok, year_results} = BDS.Search.search_posts(project.id, "", %{year: current_year}) + assert year_results.total == 2 + + # Blank query with month filter + current_month = DateTime.from_unix!(div(now, 1000), :second).month + + assert {:ok, month_results} = + BDS.Search.search_posts(project.id, "", %{month: current_month}) + + assert month_results.total >= 2 + + # Blank query with date range filter + assert {:ok, range_results} = + BDS.Search.search_posts(project.id, "", %{ + from: div(last_year, 1000) * 1000, + to: now + }) + + assert range_results.total == 3 + + # Blank query with pagination + assert {:ok, page_results} = + BDS.Search.search_posts(project.id, "", %{limit: 1, offset: 0}) + + assert page_results.total == 3 + assert length(page_results.posts) == 1 + end + + test "search_posts with non-blank query applies filters in SQL", %{project: project} do + now = System.system_time(:millisecond) + + assert {:ok, _post1} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Nebula Alpha", + content: "galaxy content", + tags: ["space"], + categories: ["astro"], + language: "en", + status: :draft, + created_at: now + }) + + assert {:ok, post2} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Nebula Beta", + content: "galaxy content", + tags: ["space"], + categories: ["astro"], + language: "de", + status: :published, + created_at: now + }) + + assert {:ok, post2} = BDS.Posts.publish_post(post2.id) + + :ok = BDS.Search.reindex_project(project.id) + + # Query + status filter + assert {:ok, results} = + BDS.Search.search_posts(project.id, "nebula", %{status: :published}) + + assert results.total == 1 + assert Enum.map(results.posts, & &1.id) == [post2.id] + + # Query + language filter + assert {:ok, results} = BDS.Search.search_posts(project.id, "nebula", %{language: "de"}) + assert results.total == 1 + assert Enum.map(results.posts, & &1.id) == [post2.id] + + # Query + tags filter + assert {:ok, results} = + BDS.Search.search_posts(project.id, "nebula", %{tags: ["space"]}) + + assert results.total == 2 + + # Query + pagination + assert {:ok, results} = + BDS.Search.search_posts(project.id, "nebula", %{limit: 1, offset: 0}) + + assert results.total == 2 + assert length(results.posts) == 1 + end + + test "search_posts missing_translation filter with blank query", %{project: project} do + assert {:ok, post} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Translation Test", + content: "test content", + language: "en", + do_not_translate: false + }) + + now = System.system_time(:second) + + Repo.query!( + """ + INSERT INTO post_translations ( + id, project_id, translation_for, language, title, excerpt, content, status, + created_at, updated_at, published_at, file_path, checksum + ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) + """, + [ + Ecto.UUID.generate(), + project.id, + post.id, + "fr", + "Bonjour", + "Resume", + "contenu", + "draft", + now, + now, + nil, + "", + nil + ] + ) + + :ok = BDS.Search.reindex_project(project.id) + + # Post has French translation, missing German -> should match + assert {:ok, results} = + BDS.Search.search_posts(project.id, "", %{ + missing_translation_language: "de" + }) + + assert results.total == 1 + assert Enum.map(results.posts, & &1.id) == [post.id] + + # Post has French translation, not missing French -> should not match + assert {:ok, results} = + BDS.Search.search_posts(project.id, "", %{ + missing_translation_language: "fr" + }) + + assert results.total == 0 + end + + test "search_media applies blank query filters in SQL", %{project: project, temp_dir: temp_dir} do + source_path = Path.join(temp_dir, "media1.txt") + File.write!(source_path, "media1") + + assert {:ok, media1} = + BDS.Media.import_media(%{ + project_id: project.id, + source_path: source_path, + title: "Media Alpha", + alt: "alt alpha", + tags: ["space"], + language: "en" + }) + + source_path2 = Path.join(temp_dir, "media2.txt") + File.write!(source_path2, "media2") + + assert {:ok, media2} = + BDS.Media.import_media(%{ + project_id: project.id, + source_path: source_path2, + title: "Media Beta", + alt: "alt beta", + tags: ["nature"], + language: "de" + }) + + :ok = BDS.Search.reindex_project(project.id) + + # Blank query with tags filter + assert {:ok, results} = BDS.Search.search_media(project.id, "", %{tags: ["space"]}) + assert results.total == 1 + assert Enum.map(results.media, & &1.id) == [media1.id] + + # Blank query with language filter + assert {:ok, results} = BDS.Search.search_media(project.id, "", %{language: "de"}) + assert results.total == 1 + assert Enum.map(results.media, & &1.id) == [media2.id] + + # Blank query with pagination + assert {:ok, results} = BDS.Search.search_media(project.id, "", %{limit: 1, offset: 0}) + assert results.total == 2 + assert length(results.media) == 1 + end + test "lists supported stemmer languages using normalized ISO codes" do languages = BDS.Search.list_stemmer_languages()