feat: finalisation for search

This commit is contained in:
2026-04-23 17:19:17 +02:00
parent 5d16a89d1c
commit 485c4b65b7
4 changed files with 195 additions and 19 deletions

View File

@@ -9,12 +9,63 @@ defmodule BDS.Search do
alias BDS.Projects
alias BDS.Repo
# Maps a lowercase two-letter language code to the name of the Stemex function
# that stem_token/2 invokes for that language. The keys are also the codes
# returned by list_stemmer_languages/0 and the only codes normalize_language/1
# lets through unchanged.
@stemmer_algorithms %{
"da" => :danish,
"nl" => :dutch,
"en" => :english,
"fi" => :finnish,
"fr" => :french,
"de" => :german,
"hu" => :hungarian,
"it" => :italian,
"no" => :norwegian,
"pt" => :portuguese,
"ro" => :romanian,
"ru" => :russian,
"es" => :spanish,
"sv" => :swedish,
"tr" => :turkish
}
# Common words per language, consulted by detect_language_from_hints/1 when the
# text is non-empty and contains none of the diacritics checked in detect_language/1.
# Note that some words (e.g. "la", "un", "una", "con") are listed under more than
# one language, so a single matching word is not always conclusive.
@language_hints [
{"de", ~w(der die das und ein eine ist sind läuft laufen morgen fluss entlang)},
{"fr", ~w(je tu il elle nous vous ils elles le la les un une des courir cours matin travail)},
{"es", ~w(el la los las un una unos unas y que para con pero)},
{"it", ~w(il lo la gli le un una uno e che per con ogni mattina)}
]
@doc """
Returns the language codes that have a stemmer configured, sorted ascending.
"""
def list_stemmer_languages do
  Enum.sort(Map.keys(@stemmer_algorithms))
end
@doc """
Guesses the language of `text` and returns a two-letter language code.

The text is converted with `to_string/1` and downcased first. Empty text yields
`"en"`. Otherwise the first matching rule wins: German-specific characters give
`"de"`, French-specific accented characters give `"fr"`, Spanish-specific
characters give `"es"`. When none of these occur, the result comes from the
hint-word lookup.
"""
def detect_language(text) do
  lowered = String.downcase(to_string(text))

  cond do
    lowered == "" -> "en"
    lowered =~ ~r/[äöüß]/u -> "de"
    lowered =~ ~r/[àâçéèêëîïôùûüÿœ]/u -> "fr"
    lowered =~ ~r/[ñ¡¿]/u -> "es"
    true -> detect_language_from_hints(lowered)
  end
end
@doc """
Tokenizes `text` and returns its stemmed tokens joined by single spaces.

When `language` is `nil` (the default) the language is detected from `text`.
The chosen language is normalized before it is used for stemming.
"""
def stem(text, language \\ nil) do
  stemmer_language = normalize_language(language || detect_language(text))
  tokens = tokenize_text(text)

  Enum.map_join(tokens, " ", fn token -> stem_token(token, stemmer_language) end)
end
def search_posts(project_id, query, filters \\ %{}) do
filters = normalize_filters(filters)
posts =
project_id
|> candidate_post_ids(query)
|> candidate_post_ids(query, filters.language)
|> load_posts_in_order()
|> filter_posts(filters)
@@ -32,7 +83,7 @@ defmodule BDS.Search do
media_items =
project_id
|> candidate_media_ids(query)
|> candidate_media_ids(query, filters.language)
|> load_media_in_order()
{:ok,
@@ -111,10 +162,12 @@ defmodule BDS.Search do
:ok
end
defp candidate_post_ids(project_id, query) do
defp candidate_post_ids(project_id, query, language) do
if blank_query?(query) do
Repo.all(from post in Post, where: post.project_id == ^project_id, select: post.id)
else
match_query = build_match_query(query, language)
Repo.query!(
"""
SELECT posts_fts.post_id
@@ -123,16 +176,18 @@ defmodule BDS.Search do
WHERE posts.project_id = ? AND posts_fts MATCH ?
ORDER BY bm25(posts_fts), posts_fts.rowid
""",
[project_id, query]
[project_id, match_query]
).rows
|> Enum.map(fn [post_id] -> post_id end)
end
end
defp candidate_media_ids(project_id, query) do
defp candidate_media_ids(project_id, query, language) do
if blank_query?(query) do
Repo.all(from media in Media, where: media.project_id == ^project_id, select: media.id)
else
match_query = build_match_query(query, language)
Repo.query!(
"""
SELECT media_fts.media_id
@@ -141,7 +196,7 @@ defmodule BDS.Search do
WHERE media.project_id = ? AND media_fts MATCH ?
ORDER BY bm25(media_fts), media_fts.rowid
""",
[project_id, query]
[project_id, match_query]
).rows
|> Enum.map(fn [media_id] -> media_id end)
end
@@ -242,16 +297,23 @@ defmodule BDS.Search do
defp post_index_fields(post) do
translations = post_translations(post.id)
post_language = normalize_language(post.language)
title = [post.title | Enum.map(translations, &Map.get(&1, "title"))] |> join_text()
excerpt = [post.excerpt | Enum.map(translations, &Map.get(&1, "excerpt"))] |> join_text()
content =
[post_content(post) | Enum.map(translations, &translation_content(post.project_id, &1))]
title =
[stem(post.title, post_language) | Enum.map(translations, &stem(Map.get(&1, "title"), Map.get(&1, "language")))]
|> join_text()
tags = Enum.join(post.tags || [], " ")
categories = Enum.join(post.categories || [], " ")
excerpt =
[stem(post.excerpt, post_language) | Enum.map(translations, &stem(Map.get(&1, "excerpt"), Map.get(&1, "language")))]
|> join_text()
content =
[stem(post_content(post), post_language) |
Enum.map(translations, &stem(translation_content(post.project_id, &1), Map.get(&1, "language")))]
|> join_text()
tags = stem(Enum.join(post.tags || [], " "), post_language)
categories = stem(Enum.join(post.categories || [], " "), post_language)
{title, excerpt, content, tags, categories}
end
@@ -260,11 +322,17 @@ defmodule BDS.Search do
translations =
Repo.all(from translation in MediaTranslation, where: translation.translation_for == ^media.id)
title = [media.title | Enum.map(translations, & &1.title)] |> join_text()
alt = [media.alt | Enum.map(translations, & &1.alt)] |> join_text()
caption = [media.caption | Enum.map(translations, & &1.caption)] |> join_text()
original_name = media.original_name || ""
tags = Enum.join(media.tags || [], " ")
media_language = normalize_language(media.language)
title = [stem(media.title, media_language) | Enum.map(translations, &stem(&1.title, &1.language))] |> join_text()
alt = [stem(media.alt, media_language) | Enum.map(translations, &stem(&1.alt, &1.language))] |> join_text()
caption =
[stem(media.caption, media_language) | Enum.map(translations, &stem(&1.caption, &1.language))]
|> join_text()
original_name = stem(media.original_name || "", media_language)
tags = stem(Enum.join(media.tags || [], " "), media_language)
{title, alt, caption, original_name, tags}
end
@@ -329,6 +397,73 @@ defmodule BDS.Search do
|> Enum.join("\n")
end
# Builds the FTS5 MATCH expression for `query`.
#
# Each stemmed variant of the query (one per candidate language, see
# query_variants/2) becomes a parenthesised group whose quoted terms are joined
# with AND; the groups are joined with OR.
#
# A query that contains no alphanumeric tokens (for example "???") produces no
# variants. An empty MATCH expression is a syntax error in FTS5, so in that case
# an empty phrase (`""`) is returned instead: it is valid and matches no rows.
defp build_match_query(query, language) do
  case query_variants(query, language) do
    [] ->
      ~s("")

    variants ->
      Enum.map_join(variants, " OR ", fn tokens ->
        "(" <> Enum.map_join(tokens, " AND ", &quoted_term/1) <> ")"
      end)
  end
end
# Returns the distinct stemmed token lists for `query`, one per candidate
# language from query_languages/2. Variants with no tokens are dropped, so a
# query without alphanumeric tokens yields an empty list.
defp query_variants(query, language) do
  words = tokenize_text(query)

  stemmed_per_language =
    for code <- query_languages(query, language) do
      for word <- words, do: stem_token(word, code)
    end

  stemmed_per_language
  |> Enum.filter(&(&1 != []))
  |> Enum.uniq()
end
# Chooses the stemmer languages to try for a query.
# With an explicit language only that (normalized) language is used. Without
# one, the detected language comes first, followed by every configured stemmer
# language, with duplicates removed.
defp query_languages(_query, language) when not is_nil(language) do
  [normalize_language(language)]
end

defp query_languages(query, nil) do
  Enum.uniq([detect_language(query) | list_stemmer_languages()])
end
# Wraps `term` in double quotes so FTS5 treats it as a literal string.
# FTS5 follows SQL quoting: a double quote inside the string is escaped by
# doubling it. (A backslash is not an escape character there, and `~s(\")` is
# just `"` in Elixir, so the previous replacement left quotes untouched.)
defp quoted_term(term) do
  escaped = String.replace(term, "\"", "\"\"")
  "\"" <> escaped <> "\""
end
# Splits text into lowercase alphanumeric tokens; everything else acts as a
# separator. `nil` yields no tokens, and other values are converted with
# to_string/1 before scanning.
defp tokenize_text(nil), do: []

defp tokenize_text(text) do
  # Without capture groups each scan result is a one-element list.
  for [token] <- Regex.scan(~r/[[:alnum:]]+/u, to_string(text)) do
    String.downcase(token)
  end
end
# Stems a single token with the Stemex function mapped to the normalized
# language. The token is returned unchanged when no algorithm is mapped for the
# language or when the stemmer call raises (best-effort: a stemming failure
# should not abort indexing or searching).
defp stem_token(token, language) do
  algorithm = Map.get(@stemmer_algorithms, normalize_language(language))

  if algorithm do
    apply(Stemex, algorithm, [token])
  else
    token
  end
rescue
  _error -> token
end
# Reduces a language tag to a supported two-letter stemmer code.
# The value is stringified, downcased and cut at the first "-" (so "de-AT"
# becomes "de"). `nil` and any code without a configured stemmer fall back
# to "en".
defp normalize_language(nil), do: "en"

defp normalize_language(language) do
  [primary | _region] =
    language
    |> to_string()
    |> String.downcase()
    |> String.split("-", parts: 2)

  if is_map_key(@stemmer_algorithms, primary), do: primary, else: "en"
end
# Picks the language that has the most of its hint words (see @language_hints)
# present among the tokens of `text`.
#
# Several hint words ("la", "un", "il", "le", ...) are listed under more than
# one language, so taking the first language with any match would let list
# order decide (e.g. Spanish "el gato y la casa" would be reported as French
# because of "la"). Counting matches per language avoids that. Ties go to the
# language listed first in @language_hints, and text that matches no hint word
# at all is treated as "en".
defp detect_language_from_hints(text) do
  tokens = MapSet.new(tokenize_text(text))

  {best_language, best_score} =
    @language_hints
    |> Enum.map(fn {language, hints} ->
      {language, Enum.count(hints, &MapSet.member?(tokens, &1))}
    end)
    # Enum.max_by/3 keeps the first maximal element, which preserves list-order precedence on ties.
    |> Enum.max_by(fn {_language, score} -> score end, fn -> {"en", 0} end)

  if best_score > 0, do: best_language, else: "en"
end
defp normalize_filters(filters) do
%{
status: attr(filters, :status),

View File

@@ -26,7 +26,8 @@ defmodule BDS.MixProject do
{:luerl, "~> 1.5"},
{:jason, "~> 1.4"},
{:plug, "~> 1.18"},
{:image, "~> 0.65"}
{:image, "~> 0.65"},
{:stemex, "~> 0.2.1"}
]
end

View File

@@ -7,6 +7,7 @@
"ecto_sql": {:hex, :ecto_sql, "3.13.5", "2f8282b2ad97bf0f0d3217ea0a6fff320ead9e2f8770f810141189d182dc304e", [:mix], [{:db_connection, "~> 2.4.1 or ~> 2.5", [hex: :db_connection, repo: "hexpm", optional: false]}, {:ecto, "~> 3.13.0", [hex: :ecto, repo: "hexpm", optional: false]}, {:myxql, "~> 0.7", [hex: :myxql, repo: "hexpm", optional: true]}, {:postgrex, "~> 0.19 or ~> 1.0", [hex: :postgrex, repo: "hexpm", optional: true]}, {:tds, "~> 2.1.1 or ~> 2.2", [hex: :tds, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4.0 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "aa36751f4e6a2b56ae79efb0e088042e010ff4935fc8684e74c23b1f49e25fdc"},
"ecto_sqlite3": {:hex, :ecto_sqlite3, "0.22.0", "edab2d0f701b7dd05dcf7e2d97769c106aff62b5cfddc000d1dd6f46b9cbd8c3", [:mix], [{:decimal, "~> 1.6 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}, {:ecto, "~> 3.13.0", [hex: :ecto, repo: "hexpm", optional: false]}, {:ecto_sql, "~> 3.13.0", [hex: :ecto_sql, repo: "hexpm", optional: false]}, {:exqlite, "~> 0.22", [hex: :exqlite, repo: "hexpm", optional: false]}], "hexpm", "5af9e031bffcc5da0b7bca90c271a7b1e7c04a93fecf7f6cd35bc1b1921a64bd"},
"elixir_make": {:hex, :elixir_make, "0.9.0", "6484b3cd8c0cee58f09f05ecaf1a140a8c97670671a6a0e7ab4dc326c3109726", [:mix], [], "hexpm", "db23d4fd8b757462ad02f8aa73431a426fe6671c80b200d9710caf3d1dd0ffdb"},
"ex_stemmers": {:hex, :ex_stemmers, "0.1.0", "63a84ae3a6f0c28a1d75768411f0ae15cfe8462fb70589b60977aa1b04c9372d", [:mix], [{:rustler, "~> 0.32.1", [hex: :rustler, repo: "hexpm", optional: false]}], "hexpm", "498826e2188e502f41d1a15f3d90e7738f0d94747e197367f03a2a44c09167c0"},
"exqlite": {:hex, :exqlite, "0.36.0", "07b4f95d61cb82b8d52946d0639497fa7d32117e09b2c8d25e24a38723c295cb", [:make, :mix], [{:cc_precompiler, "~> 0.1", [hex: :cc_precompiler, repo: "hexpm", optional: false]}, {:db_connection, "~> 2.1", [hex: :db_connection, repo: "hexpm", optional: false]}, {:elixir_make, "~> 0.8", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:table, "~> 0.1.0", [hex: :table, repo: "hexpm", optional: true]}], "hexpm", "cbeca3ce781f9ff07cfa9a87486f3ebd512a143ad6a14ed5c9fca21fe0bf3ae7"},
"image": {:hex, :image, "0.65.0", "44908233a1a0dcdbb6ae873ec09fd9ae533d1840d300d8b0b1b186d586b935e6", [:mix], [{:color, "~> 0.4", [hex: :color, repo: "hexpm", optional: false]}, {:evision, "~> 0.1.33 or ~> 0.2", [hex: :evision, repo: "hexpm", optional: true]}, {:exla, "0.11.0", [hex: :exla, repo: "hexpm", optional: true]}, {:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: true]}, {:kino, "~> 0.13", [hex: :kino, repo: "hexpm", optional: true]}, {:nx, "~> 0.11.0", [hex: :nx, repo: "hexpm", optional: true]}, {:nx_image, "~> 0.1", [hex: :nx_image, repo: "hexpm", optional: true]}, {:phoenix_html, "~> 2.1 or ~> 3.2 or ~> 4.0", [hex: :phoenix_html, repo: "hexpm", optional: false]}, {:plug, "~> 1.13", [hex: :plug, repo: "hexpm", optional: true]}, {:req, "~> 0.4", [hex: :req, repo: "hexpm", optional: true]}, {:rustler, "> 0.0.0", [hex: :rustler, repo: "hexpm", optional: true]}, {:scholar, "~> 0.3", [hex: :scholar, repo: "hexpm", optional: true]}, {:sweet_xml, "~> 0.7", [hex: :sweet_xml, repo: "hexpm", optional: false]}, {:vix, "~> 0.33", [hex: :vix, repo: "hexpm", optional: false]}, {:xav, "~> 0.10", [hex: :xav, repo: "hexpm", optional: true]}], "hexpm", "d2060e08d0f42564f49de1ea97a82a5d237f9ac91edb141dece51f1238dd8b4a"},
"jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"},
@@ -15,7 +16,10 @@
"phoenix_html": {:hex, :phoenix_html, "4.3.0", "d3577a5df4b6954cd7890c84d955c470b5310bb49647f0a114a6eeecc850f7ad", [:mix], [], "hexpm", "3eaa290a78bab0f075f791a46a981bbe769d94bc776869f4f3063a14f30497ad"},
"plug": {:hex, :plug, "1.19.1", "09bac17ae7a001a68ae393658aa23c7e38782be5c5c00c80be82901262c394c0", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.1.1 or ~> 1.2 or ~> 2.0", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.3 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "560a0017a8f6d5d30146916862aaf9300b7280063651dd7e532b8be168511e62"},
"plug_crypto": {:hex, :plug_crypto, "2.1.1", "19bda8184399cb24afa10be734f84a16ea0a2bc65054e23a62bb10f06bc89491", [:mix], [], "hexpm", "6470bce6ffe41c8bd497612ffde1a7e4af67f36a15eea5f921af71cf3e11247c"},
"rustler": {:hex, :rustler, "0.32.1", "f4cf5a39f9e85d182c0a3f75fa15b5d0add6542ab0bf9ceac6b4023109ebd3fc", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:toml, "~> 0.6", [hex: :toml, repo: "hexpm", optional: false]}], "hexpm", "b96be75526784f86f6587f051bc8d6f4eaff23d6e0f88dbcfe4d5871f52946f7"},
"stemex": {:hex, :stemex, "0.2.1", "47017c6b10cdd6926a0d523ccf1f801c5f3faf5a0a9c862f49304e07f9b5584f", [:mix], [], "hexpm", "dbfc76d27adfa31d831d183979c595942884e6530a4496714aa5b70d0964c2e4"},
"sweet_xml": {:hex, :sweet_xml, "0.7.5", "803a563113981aaac202a1dbd39771562d0ad31004ddbfc9b5090bdcd5605277", [:mix], [], "hexpm", "193b28a9b12891cae351d81a0cead165ffe67df1b73fe5866d10629f4faefb12"},
"telemetry": {:hex, :telemetry, "1.4.1", "ab6de178e2b29b58e8256b92b382ea3f590a47152ca3651ea857a6cae05ac423", [:rebar3], [], "hexpm", "2172e05a27531d3d31dd9782841065c50dd5c3c7699d95266b2edd54c2dafa1c"},
"toml": {:hex, :toml, "0.7.0", "fbcd773caa937d0c7a02c301a1feea25612720ac3fa1ccb8bfd9d30d822911de", [:mix], [], "hexpm", "0690246a2478c1defd100b0c9b89b4ea280a22be9a7b313a8a058a2408a2fa70"},
"vix": {:hex, :vix, "0.38.0", "77529ee4f6ced339c3d5f90a9eacf306f5b7109d3d1b5e3ef391a984ad404f75", [:make, :mix], [{:cc_precompiler, "~> 0.1.4 or ~> 0.2", [hex: :cc_precompiler, repo: "hexpm", optional: false]}, {:elixir_make, "~> 0.7.3 or ~> 0.8", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:kino, "~> 0.7", [hex: :kino, repo: "hexpm", optional: true]}], "hexpm", "dca58f654922fa678d5df8e028317483d9c0f8acb2e2714076a8468695687aa7"},
}

View File

@@ -224,4 +224,40 @@ defmodule BDS.SearchTest do
assert {:ok, media_results} = BDS.Search.search_media(project.id, "imported", %{})
assert Enum.map(media_results.media, & &1.id) == ["search-media-from-file"]
end
test "search_posts applies language-aware stemming to indexed and query text", %{project: project} do
  # The post bodies contain the inflected forms "Katzen" / "cours", while the
  # queries below use "katze" / "courir": a hit is only expected when both the
  # indexed text and the query are reduced to a common stem.
  german_attrs = %{
    project_id: project.id,
    title: "Morgenroutine",
    content: "Die Katzen schlafen am Fenster.",
    language: "de"
  }

  french_attrs = %{
    project_id: project.id,
    title: "Routine matinale",
    content: "Je cours chaque matin avant le travail.",
    language: "fr"
  }

  assert {:ok, german_post} = BDS.Posts.create_post(german_attrs)
  assert {:ok, french_post} = BDS.Posts.create_post(french_attrs)

  assert {:ok, %{posts: german_hits}} = BDS.Search.search_posts(project.id, "katze", %{})
  assert Enum.map(german_hits, & &1.id) == [german_post.id]

  assert {:ok, %{posts: french_hits}} = BDS.Search.search_posts(project.id, "courir", %{})
  assert Enum.map(french_hits, & &1.id) == [french_post.id]
end
test "lists supported stemmer languages using normalized ISO codes" do
  languages = BDS.Search.list_stemmer_languages()

  assert is_list(languages)

  # Spot-check a handful of codes that must be available.
  for code <- ~w(en de fr it es) do
    assert code in languages
  end

  # The list must not contain duplicates.
  assert Enum.uniq(languages) == languages
end
end