feat: finalisation for search
This commit is contained in:
@@ -9,12 +9,63 @@ defmodule BDS.Search do
|
||||
alias BDS.Projects
|
||||
alias BDS.Repo
|
||||
|
||||
# Supported stemmer languages: two-letter language code => name of the
# Stemex function that stem_token/2 invokes for tokens in that language.
# Entries are listed alphabetically by code.
@stemmer_algorithms %{
  "da" => :danish,
  "de" => :german,
  "en" => :english,
  "es" => :spanish,
  "fi" => :finnish,
  "fr" => :french,
  "hu" => :hungarian,
  "it" => :italian,
  "nl" => :dutch,
  "no" => :norwegian,
  "pt" => :portuguese,
  "ro" => :romanian,
  "ru" => :russian,
  "sv" => :swedish,
  "tr" => :turkish
}
|
||||
|
||||
# Indicator words used by detect_language_from_hints/1 when no
# language-specific character is present in the text.
# The list is ordered: the first language with any indicator word found in
# the text wins, so words shared between entries (e.g. "la", "un") resolve
# to the earliest entry.
@language_hints [
  {
    "de",
    ~w(der die das und ein eine ist sind läuft laufen morgen fluss entlang)
  },
  {
    "fr",
    ~w(je tu il elle nous vous ils elles le la les un une des courir cours matin travail)
  },
  {
    "es",
    ~w(el la los las un una unos unas y que para con pero)
  },
  {
    "it",
    ~w(il lo la gli le un una uno e che per con ogni mattina)
  }
]
|
||||
|
||||
@doc """
Returns the language codes that have a configured stemmer, sorted ascending.
"""
def list_stemmer_languages do
  codes = for {code, _algorithm} <- @stemmer_algorithms, do: code
  Enum.sort(codes)
end
|
||||
|
||||
@doc """
Guesses the language code of `text` with simple heuristics.

The text is converted to a string and lowercased, then checked in order:

  * empty text is `"en"`
  * any of `ä ö ü ß` is `"de"`
  * any of the listed French accented characters is `"fr"`
  * any of `ñ ¡ ¿` is `"es"`
  * otherwise the indicator-word lookup decides, falling back to `"en"`
"""
def detect_language(text) do
  case text |> to_string() |> String.downcase() do
    "" ->
      "en"

    lowered ->
      # Checked top to bottom; the first matching character class wins.
      character_markers = [
        {~r/[äöüß]/u, "de"},
        {~r/[àâçéèêëîïôùûüÿœ]/u, "fr"},
        {~r/[ñ¡¿]/u, "es"}
      ]

      by_character =
        Enum.find_value(character_markers, fn {pattern, code} ->
          if Regex.match?(pattern, lowered), do: code
        end)

      by_character || detect_language_from_hints(lowered)
  end
end
|
||||
|
||||
@doc """
Tokenizes `text` and returns its stemmed tokens joined by single spaces.

When `language` is `nil` (or `false`) the language is detected from `text`;
either way it is normalized to a supported stemmer code before stemming.
"""
def stem(text, language \\ nil) do
  stemmer_code = normalize_language(language || detect_language(text))
  tokens = tokenize_text(text)

  Enum.map_join(tokens, " ", fn token -> stem_token(token, stemmer_code) end)
end
|
||||
|
||||
def search_posts(project_id, query, filters \\ %{}) do
|
||||
filters = normalize_filters(filters)
|
||||
|
||||
posts =
|
||||
project_id
|
||||
|> candidate_post_ids(query)
|
||||
|> candidate_post_ids(query, filters.language)
|
||||
|> load_posts_in_order()
|
||||
|> filter_posts(filters)
|
||||
|
||||
@@ -32,7 +83,7 @@ defmodule BDS.Search do
|
||||
|
||||
media_items =
|
||||
project_id
|
||||
|> candidate_media_ids(query)
|
||||
|> candidate_media_ids(query, filters.language)
|
||||
|> load_media_in_order()
|
||||
|
||||
{:ok,
|
||||
@@ -111,10 +162,12 @@ defmodule BDS.Search do
|
||||
:ok
|
||||
end
|
||||
|
||||
defp candidate_post_ids(project_id, query) do
|
||||
defp candidate_post_ids(project_id, query, language) do
|
||||
if blank_query?(query) do
|
||||
Repo.all(from post in Post, where: post.project_id == ^project_id, select: post.id)
|
||||
else
|
||||
match_query = build_match_query(query, language)
|
||||
|
||||
Repo.query!(
|
||||
"""
|
||||
SELECT posts_fts.post_id
|
||||
@@ -123,16 +176,18 @@ defmodule BDS.Search do
|
||||
WHERE posts.project_id = ? AND posts_fts MATCH ?
|
||||
ORDER BY bm25(posts_fts), posts_fts.rowid
|
||||
""",
|
||||
[project_id, query]
|
||||
[project_id, match_query]
|
||||
).rows
|
||||
|> Enum.map(fn [post_id] -> post_id end)
|
||||
end
|
||||
end
|
||||
|
||||
defp candidate_media_ids(project_id, query) do
|
||||
defp candidate_media_ids(project_id, query, language) do
|
||||
if blank_query?(query) do
|
||||
Repo.all(from media in Media, where: media.project_id == ^project_id, select: media.id)
|
||||
else
|
||||
match_query = build_match_query(query, language)
|
||||
|
||||
Repo.query!(
|
||||
"""
|
||||
SELECT media_fts.media_id
|
||||
@@ -141,7 +196,7 @@ defmodule BDS.Search do
|
||||
WHERE media.project_id = ? AND media_fts MATCH ?
|
||||
ORDER BY bm25(media_fts), media_fts.rowid
|
||||
""",
|
||||
[project_id, query]
|
||||
[project_id, match_query]
|
||||
).rows
|
||||
|> Enum.map(fn [media_id] -> media_id end)
|
||||
end
|
||||
@@ -242,16 +297,23 @@ defmodule BDS.Search do
|
||||
|
||||
# Builds the tuple of text columns indexed for a post:
# {title, excerpt, content, tags, categories}.
#
# The post's own fields are stemmed with the post's normalized language.
# Each translation's fields are stemmed with that translation's "language"
# value (stem/2 detects the language itself when that value is nil).
# The superseded unstemmed assignments that were interleaved here have been
# dropped: each was rebound by the stemmed version before being read.
defp post_index_fields(post) do
  translations = post_translations(post.id)
  post_language = normalize_language(post.language)

  # Stems `text` using the language recorded on the given translation.
  stem_for = fn translation, text ->
    stem(text, Map.get(translation, "language"))
  end

  title =
    join_text([
      stem(post.title, post_language)
      | Enum.map(translations, &stem_for.(&1, Map.get(&1, "title")))
    ])

  excerpt =
    join_text([
      stem(post.excerpt, post_language)
      | Enum.map(translations, &stem_for.(&1, Map.get(&1, "excerpt")))
    ])

  content =
    join_text([
      stem(post_content(post), post_language)
      | Enum.map(translations, &stem_for.(&1, translation_content(post.project_id, &1)))
    ])

  # Tags and categories have no per-translation variants; they follow the
  # post's language.
  tags = stem(Enum.join(post.tags || [], " "), post_language)
  categories = stem(Enum.join(post.categories || [], " "), post_language)

  {title, excerpt, content, tags, categories}
end
|
||||
@@ -260,11 +322,17 @@ defmodule BDS.Search do
|
||||
translations =
|
||||
Repo.all(from translation in MediaTranslation, where: translation.translation_for == ^media.id)
|
||||
|
||||
title = [media.title | Enum.map(translations, & &1.title)] |> join_text()
|
||||
alt = [media.alt | Enum.map(translations, & &1.alt)] |> join_text()
|
||||
caption = [media.caption | Enum.map(translations, & &1.caption)] |> join_text()
|
||||
original_name = media.original_name || ""
|
||||
tags = Enum.join(media.tags || [], " ")
|
||||
media_language = normalize_language(media.language)
|
||||
|
||||
title = [stem(media.title, media_language) | Enum.map(translations, &stem(&1.title, &1.language))] |> join_text()
|
||||
alt = [stem(media.alt, media_language) | Enum.map(translations, &stem(&1.alt, &1.language))] |> join_text()
|
||||
|
||||
caption =
|
||||
[stem(media.caption, media_language) | Enum.map(translations, &stem(&1.caption, &1.language))]
|
||||
|> join_text()
|
||||
|
||||
original_name = stem(media.original_name || "", media_language)
|
||||
tags = stem(Enum.join(media.tags || [], " "), media_language)
|
||||
|
||||
{title, alt, caption, original_name, tags}
|
||||
end
|
||||
@@ -329,6 +397,73 @@ defmodule BDS.Search do
|
||||
|> Enum.join("\n")
|
||||
end
|
||||
|
||||
# Builds the FTS MATCH expression for `query`.
#
# Each stemmed variant of the query (one per candidate language) becomes a
# parenthesised conjunction of quoted terms, and the variants are OR-ed:
#
#     ("katz") OR ("katze")
#
# Returns "" when the query yields no tokens at all.
# NOTE(review): an empty MATCH expression is presumably rejected by SQLite;
# confirm that blank_query?/1 also covers punctuation-only input.
defp build_match_query(query, language) do
  query
  |> query_variants(language)
  |> Enum.map_join(" OR ", fn tokens ->
    "(" <> Enum.map_join(tokens, " AND ", &quoted_term/1) <> ")"
  end)
end
|
||||
|
||||
# Returns the distinct stemmed token lists for `query`, one per candidate
# language, in candidate order. Empty token lists are dropped, so a query
# without any tokens produces [].
defp query_variants(query, language) do
  stemmer_codes = query_languages(query, language)
  raw_tokens = tokenize_text(query)

  for code <- stemmer_codes,
      variant = Enum.map(raw_tokens, fn token -> stem_token(token, code) end),
      variant != [],
      uniq: true,
      do: variant
end
|
||||
|
||||
# Candidate stemmer languages for a query.
#
# Without an explicit language: the detected language first, followed by
# every other supported language, without duplicates.
defp query_languages(query, nil) do
  Enum.uniq([detect_language(query) | list_stemmer_languages()])
end

# With an explicit language: only that language, normalized.
defp query_languages(_query, language) do
  [normalize_language(language)]
end
|
||||
|
||||
# Wraps `term` in double quotes so FTS5 treats it as a string literal.
# An embedded double quote is escaped SQL-style by doubling it; the previous
# replacement (`"` -> `\"` inside a ~s sigil) unescaped to a bare `"` and
# therefore changed nothing.
defp quoted_term(term) do
  ~s(") <> String.replace(term, ~s("), ~s("")) <> ~s(")
end
|
||||
|
||||
# Splits text into lowercase alphanumeric tokens (Unicode-aware).
# nil yields no tokens; any other value is converted with to_string/1.
defp tokenize_text(nil), do: []

defp tokenize_text(text) do
  # Tokens are matched first and lowercased afterwards, so case mapping
  # cannot change where a token begins or ends.
  for [token] <- Regex.scan(~r/[[:alnum:]]+/u, to_string(text)) do
    String.downcase(token)
  end
end
|
||||
|
||||
# Stems one token with the Stemex function configured for `language`.
# Best effort: if no algorithm is configured, or the stemmer raises, the
# token is returned unchanged.
defp stem_token(token, language) do
  code = normalize_language(language)

  with {:ok, algorithm} <- Map.fetch(@stemmer_algorithms, code) do
    apply(Stemex, algorithm, [token])
  else
    :error -> token
  end
rescue
  _error -> token
end
|
||||
|
||||
# Reduces a language identifier to a supported stemmer code.
# nil, and anything whose lowercased part before the first "-" is not a
# configured code, falls back to "en" (e.g. "de-AT" -> "de", "xx" -> "en").
defp normalize_language(nil), do: "en"

defp normalize_language(language) do
  [primary | _region] =
    language
    |> to_string()
    |> String.downcase()
    |> String.split("-", parts: 2)

  case @stemmer_algorithms do
    %{^primary => _algorithm} -> primary
    _unsupported -> "en"
  end
end
|
||||
|
||||
# Picks the first language in @language_hints that has at least one
# indicator word among the tokens of `text`; "en" when none matches.
defp detect_language_from_hints(text) do
  seen = text |> tokenize_text() |> MapSet.new()

  first_hit =
    Enum.find(@language_hints, fn {_code, words} ->
      Enum.any?(words, fn word -> MapSet.member?(seen, word) end)
    end)

  case first_hit do
    {code, _words} -> code
    nil -> "en"
  end
end
|
||||
|
||||
defp normalize_filters(filters) do
|
||||
%{
|
||||
status: attr(filters, :status),
|
||||
|
||||
3
mix.exs
3
mix.exs
@@ -26,7 +26,8 @@ defmodule BDS.MixProject do
|
||||
{:luerl, "~> 1.5"},
|
||||
{:jason, "~> 1.4"},
|
||||
{:plug, "~> 1.18"},
|
||||
{:image, "~> 0.65"}
|
||||
{:image, "~> 0.65"},
|
||||
{:stemex, "~> 0.2.1"}
|
||||
]
|
||||
end
|
||||
|
||||
|
||||
4
mix.lock
4
mix.lock
@@ -7,6 +7,7 @@
|
||||
"ecto_sql": {:hex, :ecto_sql, "3.13.5", "2f8282b2ad97bf0f0d3217ea0a6fff320ead9e2f8770f810141189d182dc304e", [:mix], [{:db_connection, "~> 2.4.1 or ~> 2.5", [hex: :db_connection, repo: "hexpm", optional: false]}, {:ecto, "~> 3.13.0", [hex: :ecto, repo: "hexpm", optional: false]}, {:myxql, "~> 0.7", [hex: :myxql, repo: "hexpm", optional: true]}, {:postgrex, "~> 0.19 or ~> 1.0", [hex: :postgrex, repo: "hexpm", optional: true]}, {:tds, "~> 2.1.1 or ~> 2.2", [hex: :tds, repo: "hexpm", optional: true]}, {:telemetry, "~> 0.4.0 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "aa36751f4e6a2b56ae79efb0e088042e010ff4935fc8684e74c23b1f49e25fdc"},
|
||||
"ecto_sqlite3": {:hex, :ecto_sqlite3, "0.22.0", "edab2d0f701b7dd05dcf7e2d97769c106aff62b5cfddc000d1dd6f46b9cbd8c3", [:mix], [{:decimal, "~> 1.6 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}, {:ecto, "~> 3.13.0", [hex: :ecto, repo: "hexpm", optional: false]}, {:ecto_sql, "~> 3.13.0", [hex: :ecto_sql, repo: "hexpm", optional: false]}, {:exqlite, "~> 0.22", [hex: :exqlite, repo: "hexpm", optional: false]}], "hexpm", "5af9e031bffcc5da0b7bca90c271a7b1e7c04a93fecf7f6cd35bc1b1921a64bd"},
|
||||
"elixir_make": {:hex, :elixir_make, "0.9.0", "6484b3cd8c0cee58f09f05ecaf1a140a8c97670671a6a0e7ab4dc326c3109726", [:mix], [], "hexpm", "db23d4fd8b757462ad02f8aa73431a426fe6671c80b200d9710caf3d1dd0ffdb"},
|
||||
"ex_stemmers": {:hex, :ex_stemmers, "0.1.0", "63a84ae3a6f0c28a1d75768411f0ae15cfe8462fb70589b60977aa1b04c9372d", [:mix], [{:rustler, "~> 0.32.1", [hex: :rustler, repo: "hexpm", optional: false]}], "hexpm", "498826e2188e502f41d1a15f3d90e7738f0d94747e197367f03a2a44c09167c0"},
|
||||
"exqlite": {:hex, :exqlite, "0.36.0", "07b4f95d61cb82b8d52946d0639497fa7d32117e09b2c8d25e24a38723c295cb", [:make, :mix], [{:cc_precompiler, "~> 0.1", [hex: :cc_precompiler, repo: "hexpm", optional: false]}, {:db_connection, "~> 2.1", [hex: :db_connection, repo: "hexpm", optional: false]}, {:elixir_make, "~> 0.8", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:table, "~> 0.1.0", [hex: :table, repo: "hexpm", optional: true]}], "hexpm", "cbeca3ce781f9ff07cfa9a87486f3ebd512a143ad6a14ed5c9fca21fe0bf3ae7"},
|
||||
"image": {:hex, :image, "0.65.0", "44908233a1a0dcdbb6ae873ec09fd9ae533d1840d300d8b0b1b186d586b935e6", [:mix], [{:color, "~> 0.4", [hex: :color, repo: "hexpm", optional: false]}, {:evision, "~> 0.1.33 or ~> 0.2", [hex: :evision, repo: "hexpm", optional: true]}, {:exla, "0.11.0", [hex: :exla, repo: "hexpm", optional: true]}, {:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: true]}, {:kino, "~> 0.13", [hex: :kino, repo: "hexpm", optional: true]}, {:nx, "~> 0.11.0", [hex: :nx, repo: "hexpm", optional: true]}, {:nx_image, "~> 0.1", [hex: :nx_image, repo: "hexpm", optional: true]}, {:phoenix_html, "~> 2.1 or ~> 3.2 or ~> 4.0", [hex: :phoenix_html, repo: "hexpm", optional: false]}, {:plug, "~> 1.13", [hex: :plug, repo: "hexpm", optional: true]}, {:req, "~> 0.4", [hex: :req, repo: "hexpm", optional: true]}, {:rustler, "> 0.0.0", [hex: :rustler, repo: "hexpm", optional: true]}, {:scholar, "~> 0.3", [hex: :scholar, repo: "hexpm", optional: true]}, {:sweet_xml, "~> 0.7", [hex: :sweet_xml, repo: "hexpm", optional: false]}, {:vix, "~> 0.33", [hex: :vix, repo: "hexpm", optional: false]}, {:xav, "~> 0.10", [hex: :xav, repo: "hexpm", optional: true]}], "hexpm", "d2060e08d0f42564f49de1ea97a82a5d237f9ac91edb141dece51f1238dd8b4a"},
|
||||
"jason": {:hex, :jason, "1.4.4", "b9226785a9aa77b6857ca22832cffa5d5011a667207eb2a0ad56adb5db443b8a", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "c5eb0cab91f094599f94d55bc63409236a8ec69a21a67814529e8d5f6cc90b3b"},
|
||||
@@ -15,7 +16,10 @@
|
||||
"phoenix_html": {:hex, :phoenix_html, "4.3.0", "d3577a5df4b6954cd7890c84d955c470b5310bb49647f0a114a6eeecc850f7ad", [:mix], [], "hexpm", "3eaa290a78bab0f075f791a46a981bbe769d94bc776869f4f3063a14f30497ad"},
|
||||
"plug": {:hex, :plug, "1.19.1", "09bac17ae7a001a68ae393658aa23c7e38782be5c5c00c80be82901262c394c0", [:mix], [{:mime, "~> 1.0 or ~> 2.0", [hex: :mime, repo: "hexpm", optional: false]}, {:plug_crypto, "~> 1.1.1 or ~> 1.2 or ~> 2.0", [hex: :plug_crypto, repo: "hexpm", optional: false]}, {:telemetry, "~> 0.4.3 or ~> 1.0", [hex: :telemetry, repo: "hexpm", optional: false]}], "hexpm", "560a0017a8f6d5d30146916862aaf9300b7280063651dd7e532b8be168511e62"},
|
||||
"plug_crypto": {:hex, :plug_crypto, "2.1.1", "19bda8184399cb24afa10be734f84a16ea0a2bc65054e23a62bb10f06bc89491", [:mix], [], "hexpm", "6470bce6ffe41c8bd497612ffde1a7e4af67f36a15eea5f921af71cf3e11247c"},
|
||||
"rustler": {:hex, :rustler, "0.32.1", "f4cf5a39f9e85d182c0a3f75fa15b5d0add6542ab0bf9ceac6b4023109ebd3fc", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:toml, "~> 0.6", [hex: :toml, repo: "hexpm", optional: false]}], "hexpm", "b96be75526784f86f6587f051bc8d6f4eaff23d6e0f88dbcfe4d5871f52946f7"},
|
||||
"stemex": {:hex, :stemex, "0.2.1", "47017c6b10cdd6926a0d523ccf1f801c5f3faf5a0a9c862f49304e07f9b5584f", [:mix], [], "hexpm", "dbfc76d27adfa31d831d183979c595942884e6530a4496714aa5b70d0964c2e4"},
|
||||
"sweet_xml": {:hex, :sweet_xml, "0.7.5", "803a563113981aaac202a1dbd39771562d0ad31004ddbfc9b5090bdcd5605277", [:mix], [], "hexpm", "193b28a9b12891cae351d81a0cead165ffe67df1b73fe5866d10629f4faefb12"},
|
||||
"telemetry": {:hex, :telemetry, "1.4.1", "ab6de178e2b29b58e8256b92b382ea3f590a47152ca3651ea857a6cae05ac423", [:rebar3], [], "hexpm", "2172e05a27531d3d31dd9782841065c50dd5c3c7699d95266b2edd54c2dafa1c"},
|
||||
"toml": {:hex, :toml, "0.7.0", "fbcd773caa937d0c7a02c301a1feea25612720ac3fa1ccb8bfd9d30d822911de", [:mix], [], "hexpm", "0690246a2478c1defd100b0c9b89b4ea280a22be9a7b313a8a058a2408a2fa70"},
|
||||
"vix": {:hex, :vix, "0.38.0", "77529ee4f6ced339c3d5f90a9eacf306f5b7109d3d1b5e3ef391a984ad404f75", [:make, :mix], [{:cc_precompiler, "~> 0.1.4 or ~> 0.2", [hex: :cc_precompiler, repo: "hexpm", optional: false]}, {:elixir_make, "~> 0.7.3 or ~> 0.8", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:kino, "~> 0.7", [hex: :kino, repo: "hexpm", optional: true]}], "hexpm", "dca58f654922fa678d5df8e028317483d9c0f8acb2e2714076a8468695687aa7"},
|
||||
}
|
||||
|
||||
@@ -224,4 +224,40 @@ defmodule BDS.SearchTest do
|
||||
assert {:ok, media_results} = BDS.Search.search_media(project.id, "imported", %{})
|
||||
assert Enum.map(media_results.media, & &1.id) == ["search-media-from-file"]
|
||||
end
|
||||
|
||||
test "search_posts applies language-aware stemming to indexed and query text", %{project: project} do
  # One German and one French post; each query below uses a different
  # inflection than the word stored in the post content.
  german_attrs = %{
    project_id: project.id,
    title: "Morgenroutine",
    content: "Die Katzen schlafen am Fenster.",
    language: "de"
  }

  french_attrs = %{
    project_id: project.id,
    title: "Routine matinale",
    content: "Je cours chaque matin avant le travail.",
    language: "fr"
  }

  assert {:ok, german_post} = BDS.Posts.create_post(german_attrs)
  assert {:ok, french_post} = BDS.Posts.create_post(french_attrs)

  # "katze" must find "Katzen".
  assert {:ok, german_results} = BDS.Search.search_posts(project.id, "katze", %{})
  german_ids = for post <- german_results.posts, do: post.id
  assert german_ids == [german_post.id]

  # "courir" must find "cours".
  assert {:ok, french_results} = BDS.Search.search_posts(project.id, "courir", %{})
  french_ids = for post <- french_results.posts, do: post.id
  assert french_ids == [french_post.id]
end
|
||||
|
||||
test "lists supported stemmer languages using normalized ISO codes" do
  languages = BDS.Search.list_stemmer_languages()

  assert is_list(languages)

  # Spot-check a handful of codes that must be supported.
  for code <- ["en", "de", "fr", "it", "es"] do
    assert code in languages
  end

  # No code may be listed twice.
  assert Enum.uniq(languages) == languages
end
|
||||
end
|
||||
|
||||
Reference in New Issue
Block a user