diff --git a/lib/bds/search.ex b/lib/bds/search.ex index 7b42293..ebb3353 100644 --- a/lib/bds/search.ex +++ b/lib/bds/search.ex @@ -9,12 +9,63 @@ defmodule BDS.Search do alias BDS.Projects alias BDS.Repo + @stemmer_algorithms %{ + "da" => :danish, + "nl" => :dutch, + "en" => :english, + "fi" => :finnish, + "fr" => :french, + "de" => :german, + "hu" => :hungarian, + "it" => :italian, + "no" => :norwegian, + "pt" => :portuguese, + "ro" => :romanian, + "ru" => :russian, + "es" => :spanish, + "sv" => :swedish, + "tr" => :turkish + } + + @language_hints [ + {"de", ~w(der die das und ein eine ist sind läuft laufen morgen fluss entlang)}, + {"fr", ~w(je tu il elle nous vous ils elles le la les un une des courir cours matin travail)}, + {"es", ~w(el la los las un una unos unas y que para con pero)}, + {"it", ~w(il lo la gli le un una uno e che per con ogni mattina)} + ] + + def list_stemmer_languages do + @stemmer_algorithms + |> Map.keys() + |> Enum.sort() + end + + def detect_language(text) do + normalized_text = text |> to_string() |> String.downcase() + + cond do + normalized_text == "" -> "en" + String.match?(normalized_text, ~r/[äöüß]/u) -> "de" + String.match?(normalized_text, ~r/[àâçéèêëîïôùûüÿœ]/u) -> "fr" + String.match?(normalized_text, ~r/[ñ¡¿]/u) -> "es" + true -> detect_language_from_hints(normalized_text) + end + end + + def stem(text, language \\ nil) do + language = normalize_language(language || detect_language(text)) + + text + |> tokenize_text() + |> Enum.map_join(" ", &stem_token(&1, language)) + end + def search_posts(project_id, query, filters \\ %{}) do filters = normalize_filters(filters) posts = project_id - |> candidate_post_ids(query) + |> candidate_post_ids(query, filters.language) |> load_posts_in_order() |> filter_posts(filters) @@ -32,7 +83,7 @@ defmodule BDS.Search do media_items = project_id - |> candidate_media_ids(query) + |> candidate_media_ids(query, filters.language) |> load_media_in_order() {:ok, @@ -111,10 +162,12 @@ defmodule 
BDS.Search do :ok end - defp candidate_post_ids(project_id, query) do + defp candidate_post_ids(project_id, query, language) do if blank_query?(query) do Repo.all(from post in Post, where: post.project_id == ^project_id, select: post.id) else + match_query = build_match_query(query, language) + Repo.query!( """ SELECT posts_fts.post_id @@ -123,16 +176,18 @@ defmodule BDS.Search do WHERE posts.project_id = ? AND posts_fts MATCH ? ORDER BY bm25(posts_fts), posts_fts.rowid """, - [project_id, query] + [project_id, match_query] ).rows |> Enum.map(fn [post_id] -> post_id end) end end - defp candidate_media_ids(project_id, query) do + defp candidate_media_ids(project_id, query, language) do if blank_query?(query) do Repo.all(from media in Media, where: media.project_id == ^project_id, select: media.id) else + match_query = build_match_query(query, language) + Repo.query!( """ SELECT media_fts.media_id @@ -141,7 +196,7 @@ defmodule BDS.Search do WHERE media.project_id = ? AND media_fts MATCH ? 
ORDER BY bm25(media_fts), media_fts.rowid """, - [project_id, query] + [project_id, match_query] ).rows |> Enum.map(fn [media_id] -> media_id end) end @@ -242,16 +297,23 @@ defmodule BDS.Search do defp post_index_fields(post) do translations = post_translations(post.id) + post_language = normalize_language(post.language) - title = [post.title | Enum.map(translations, &Map.get(&1, "title"))] |> join_text() - excerpt = [post.excerpt | Enum.map(translations, &Map.get(&1, "excerpt"))] |> join_text() - - content = - [post_content(post) | Enum.map(translations, &translation_content(post.project_id, &1))] + title = + [stem(post.title, post_language) | Enum.map(translations, &stem(Map.get(&1, "title"), Map.get(&1, "language")))] |> join_text() - tags = Enum.join(post.tags || [], " ") - categories = Enum.join(post.categories || [], " ") + excerpt = + [stem(post.excerpt, post_language) | Enum.map(translations, &stem(Map.get(&1, "excerpt"), Map.get(&1, "language")))] + |> join_text() + + content = + [stem(post_content(post), post_language) | + Enum.map(translations, &stem(translation_content(post.project_id, &1), Map.get(&1, "language")))] + |> join_text() + + tags = stem(Enum.join(post.tags || [], " "), post_language) + categories = stem(Enum.join(post.categories || [], " "), post_language) {title, excerpt, content, tags, categories} end @@ -260,11 +322,17 @@ defmodule BDS.Search do translations = Repo.all(from translation in MediaTranslation, where: translation.translation_for == ^media.id) - title = [media.title | Enum.map(translations, & &1.title)] |> join_text() - alt = [media.alt | Enum.map(translations, & &1.alt)] |> join_text() - caption = [media.caption | Enum.map(translations, & &1.caption)] |> join_text() - original_name = media.original_name || "" - tags = Enum.join(media.tags || [], " ") + media_language = normalize_language(media.language) + + title = [stem(media.title, media_language) | Enum.map(translations, &stem(&1.title, &1.language))] |> join_text() + alt 
= [stem(media.alt, media_language) | Enum.map(translations, &stem(&1.alt, &1.language))] |> join_text() + + caption = + [stem(media.caption, media_language) | Enum.map(translations, &stem(&1.caption, &1.language))] + |> join_text() + + original_name = stem(media.original_name || "", media_language) + tags = stem(Enum.join(media.tags || [], " "), media_language) {title, alt, caption, original_name, tags} end @@ -329,6 +397,73 @@ defmodule BDS.Search do |> Enum.join("\n") end + defp build_match_query(query, language) do + query + |> query_variants(language) + |> Enum.map_join(" OR ", fn tokens -> + tokens + |> Enum.map_join(" AND ", &quoted_term/1) + |> then(&"(" <> &1 <> ")") + end) + end + + defp query_variants(query, language) do + languages = query_languages(query, language) + tokens = tokenize_text(query) + + languages + |> Enum.map(fn stemmer_language -> Enum.map(tokens, &stem_token(&1, stemmer_language)) end) + |> Enum.reject(&Enum.empty?/1) + |> Enum.uniq() + end + + defp query_languages(query, nil) do + detected = detect_language(query) + + ([detected] ++ list_stemmer_languages()) + |> Enum.uniq() + end + + defp query_languages(_query, language), do: [normalize_language(language)] + + defp quoted_term(term), do: ~s("#{String.replace(term, ~s("), ~s(\"))}") + + defp tokenize_text(nil), do: [] + + defp tokenize_text(text) do + Regex.scan(~r/[[:alnum:]]+/u, to_string(text)) + |> List.flatten() + |> Enum.map(&String.downcase/1) + end + + defp stem_token(token, language) do + case Map.fetch(@stemmer_algorithms, normalize_language(language)) do + {:ok, algorithm} -> apply(Stemex, algorithm, [token]) + :error -> token + end + rescue + _error -> token + end + + defp normalize_language(nil), do: "en" + + defp normalize_language(language) do + language + |> to_string() + |> String.downcase() + |> String.split("-", parts: 2) + |> hd() + |> then(fn code -> if Map.has_key?(@stemmer_algorithms, code), do: code, else: "en" end) + end + + defp detect_language_from_hints(text) 
do + tokens = MapSet.new(tokenize_text(text)) + + Enum.find_value(@language_hints, "en", fn {language, hints} -> + if Enum.any?(hints, &MapSet.member?(tokens, &1)), do: language, else: false + end) + end + defp normalize_filters(filters) do %{ status: attr(filters, :status), diff --git a/mix.exs b/mix.exs index 94a6d9d..85cc95c 100644 --- a/mix.exs +++ b/mix.exs @@ -26,7 +26,8 @@ defmodule BDS.MixProject do {:luerl, "~> 1.5"}, {:jason, "~> 1.4"}, {:plug, "~> 1.18"}, - {:image, "~> 0.65"} + {:image, "~> 0.65"}, + {:stemex, "~> 0.2.1"} ] end diff --git a/mix.lock b/mix.lock index 5e39534..24d8f00 100644 --- a/mix.lock +++ b/mix.lock @@ -7,6 +7,7 @@ "ecto_sql": {:hex, :ecto_sql, "3.13.5", "2f8282b2ad97bf0f0d3217ea0a6fff320ead9e2f8770f810141189d182dc304e", [:mix], [{:db_connection, "~> 2.4.1 or ~> 2.5", [hex: :db_connection, repo: "hexpm", optional: false]}, {:ecto, "~> 3.13.0", [hex: :ecto, repo: "hexpm", optional: false]}, {:myxql, "~> 0.7", [hex: :myxql, repo: "hexpm", optional: true]}, {:postgrex, "~> 0.19 or ~> 1.0", [hex: :postgrex, repo: "hexpm", optional: true]}, {:tds, "~> 2.1.1 or ~> 2.2", [hex: :tds, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4.0 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "aa36751f4e6a2b56ae79efb0e088042e010ff4935fc8684e74c23b1f49e25fdc"}, "ecto_sqlite3": {:hex, :ecto_sqlite3, "0.22.0", "edab2d0f701b7dd05dcf7e2d97769c106aff62b5cfddc000d1dd6f46b9cbd8c3", [:mix], [{:decimal, "~> 1.6 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}, {:ecto, "~> 3.13.0", [hex: :ecto, repo: "hexpm", optional: false]}, {:ecto_sql, "~> 3.13.0", [hex: :ecto_sql, repo: "hexpm", optional: false]}, {:exqlite, "~> 0.22", [hex: :exqlite, repo: "hexpm", optional: false]}], "hexpm", "5af9e031bffcc5da0b7bca90c271a7b1e7c04a93fecf7f6cd35bc1b1921a64bd"}, "elixir_make": {:hex, :elixir_make, "0.9.0", "6484b3cd8c0cee58f09f05ecaf1a140a8c97670671a6a0e7ab4dc326c3109726", [:mix], [], "hexpm", 
"db23d4fd8b757462ad02f8aa73431a426fe6671c80b200d9710caf3d1dd0ffdb"}, + "ex_stemmers": {:hex, :ex_stemmers, "0.1.0", "63a84ae3a6f0c28a1d75768411f0ae15cfe8462fb70589b60977aa1b04c9372d", [:mix], [{:rustler, "~> 0.32.1", [hex: :rustler, repo: "hexpm", optional: false]}], "hexpm", "498826e2188e502f41d1a15f3d90e7738f0d94747e197367f03a2a44c09167c0"}, "exqlite": {:hex, :exqlite, "0.36.0", "07b4f95d61cb82b8d52946d0639497fa7d32117e09b2c8d25e24a38723c295cb", [:make, :mix], [{:cc_precompiler, "~> 0.1", [hex: :cc_precompiler, repo: "hexpm", optional: false]}, {:db_connection, "~> 2.1", [hex: :db_connection, repo: "hexpm", optional: false]}, {:elixir_make, "~> 0.8", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:table, "~> 0.1.0", [hex: :table, repo: "hexpm", optional: true]}], "hexpm", "cbeca3ce781f9ff07cfa9a87486f3ebd512a143ad6a14ed5c9fca21fe0bf3ae7"}, "image": {:hex, :image, "0.65.0", "44908233a1a0dcdbb6ae873ec09fd9ae533d1840d300d8b0b1b186d586b935e6", [:mix], [{:color, "~> 0.4", [hex: :color, repo: "hexpm", optional: false]}, {:evision, "~> 0.1.33 or ~> 0.2", [hex: :evision, repo: "hexpm", optional: true]}, {:exla, "0.11.0", [hex: :exla, repo: "hexpm", optional: true]}, {:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: true]}, {:kino, "~> 0.13", [hex: :kino, repo: "hexpm", optional: true]}, {:nx, "~> 0.11.0", [hex: :nx, repo: "hexpm", optional: true]}, {:nx_image, "~> 0.1", [hex: :nx_image, repo: "hexpm", optional: true]}, {:phoenix_html, "~> 2.1 or ~> 3.2 or ~> 4.0", [hex: :phoenix_html, repo: "hexpm", optional: false]}, {:plug, "~> 1.13", [hex: :plug, repo: "hexpm", optional: true]}, {:req, "~> 0.4", [hex: :req, repo: "hexpm", optional: true]}, {:rustler, "> 0.0.0", [hex: :rustler, repo: "hexpm", optional: true]}, {:scholar, "~> 0.3", [hex: :scholar, repo: "hexpm", optional: true]}, {:sweet_xml, "~> 0.7", [hex: :sweet_xml, repo: "hexpm", optional: false]}, {:vix, "~> 0.33", [hex: :vix, repo: "hexpm", optional: false]}, {:xav, "~> 0.10", [hex: :xav, repo: 
"hexpm", optional: true]}], "hexpm", "d2060e08d0f42564f49de1ea97a82a5d237f9ac91edb141dece51f1238dd8b4a"}, "jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"}, @@ -15,7 +16,10 @@ "phoenix_html": {:hex, :phoenix_html, "4.3.0", "d3577a5df4b6954cd7890c84d955c470b5310bb49647f0a114a6eeecc850f7ad", [:mix], [], "hexpm", "3eaa290a78bab0f075f791a46a981bbe769d94bc776869f4f3063a14f30497ad"}, "plug": {:hex, :plug, "1.19.1", "09bac17ae7a001a68ae393658aa23c7e38782be5c5c00c80be82901262c394c0", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.1.1 or ~> 1.2 or ~> 2.0", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.3 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "560a0017a8f6d5d30146916862aaf9300b7280063651dd7e532b8be168511e62"}, "plug_crypto": {:hex, :plug_crypto, "2.1.1", "19bda8184399cb24afa10be734f84a16ea0a2bc65054e23a62bb10f06bc89491", [:mix], [], "hexpm", "6470bce6ffe41c8bd497612ffde1a7e4af67f36a15eea5f921af71cf3e11247c"}, + "rustler": {:hex, :rustler, "0.32.1", "f4cf5a39f9e85d182c0a3f75fa15b5d0add6542ab0bf9ceac6b4023109ebd3fc", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:toml, "~> 0.6", [hex: :toml, repo: "hexpm", optional: false]}], "hexpm", "b96be75526784f86f6587f051bc8d6f4eaff23d6e0f88dbcfe4d5871f52946f7"}, + "stemex": {:hex, :stemex, "0.2.1", "47017c6b10cdd6926a0d523ccf1f801c5f3faf5a0a9c862f49304e07f9b5584f", [:mix], [], "hexpm", "dbfc76d27adfa31d831d183979c595942884e6530a4496714aa5b70d0964c2e4"}, "sweet_xml": {:hex, :sweet_xml, "0.7.5", "803a563113981aaac202a1dbd39771562d0ad31004ddbfc9b5090bdcd5605277", [:mix], [], "hexpm", "193b28a9b12891cae351d81a0cead165ffe67df1b73fe5866d10629f4faefb12"}, "telemetry": {:hex, 
:telemetry, "1.4.1", "ab6de178e2b29b58e8256b92b382ea3f590a47152ca3651ea857a6cae05ac423", [:rebar3], [], "hexpm", "2172e05a27531d3d31dd9782841065c50dd5c3c7699d95266b2edd54c2dafa1c"}, + "toml": {:hex, :toml, "0.7.0", "fbcd773caa937d0c7a02c301a1feea25612720ac3fa1ccb8bfd9d30d822911de", [:mix], [], "hexpm", "0690246a2478c1defd100b0c9b89b4ea280a22be9a7b313a8a058a2408a2fa70"}, "vix": {:hex, :vix, "0.38.0", "77529ee4f6ced339c3d5f90a9eacf306f5b7109d3d1b5e3ef391a984ad404f75", [:make, :mix], [{:cc_precompiler, "~> 0.1.4 or ~> 0.2", [hex: :cc_precompiler, repo: "hexpm", optional: false]}, {:elixir_make, "~> 0.7.3 or ~> 0.8", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:kino, "~> 0.7", [hex: :kino, repo: "hexpm", optional: true]}], "hexpm", "dca58f654922fa678d5df8e028317483d9c0f8acb2e2714076a8468695687aa7"}, } diff --git a/test/bds/search_test.exs b/test/bds/search_test.exs index 10913ff..6ef75d5 100644 --- a/test/bds/search_test.exs +++ b/test/bds/search_test.exs @@ -224,4 +224,40 @@ defmodule BDS.SearchTest do assert {:ok, media_results} = BDS.Search.search_media(project.id, "imported", %{}) assert Enum.map(media_results.media, & &1.id) == ["search-media-from-file"] end + + test "search_posts applies language-aware stemming to indexed and query text", %{project: project} do + assert {:ok, german_post} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Morgenroutine", + content: "Die Katzen schlafen am Fenster.", + language: "de" + }) + + assert {:ok, french_post} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Routine matinale", + content: "Je cours chaque matin avant le travail.", + language: "fr" + }) + + assert {:ok, german_results} = BDS.Search.search_posts(project.id, "katze", %{}) + assert Enum.map(german_results.posts, & &1.id) == [german_post.id] + + assert {:ok, french_results} = BDS.Search.search_posts(project.id, "courir", %{}) + assert Enum.map(french_results.posts, & &1.id) == [french_post.id] + end + + test "lists 
supported stemmer languages using normalized ISO codes" do + languages = BDS.Search.list_stemmer_languages() + + assert is_list(languages) + assert "en" in languages + assert "de" in languages + assert "fr" in languages + assert "it" in languages + assert "es" in languages + assert Enum.uniq(languages) == languages + end end