diff --git a/lib/bds/embeddings.ex b/lib/bds/embeddings.ex new file mode 100644 index 0000000..6855d96 --- /dev/null +++ b/lib/bds/embeddings.ex @@ -0,0 +1,342 @@ +defmodule BDS.Embeddings do + @moduledoc false + + import Ecto.Query + + alias BDS.Embeddings.DismissedDuplicatePair + alias BDS.Embeddings.Key + alias BDS.Metadata + alias BDS.Posts.Post + alias BDS.Projects + alias BDS.Repo + + @dimensions 384 + @duplicate_threshold 0.5 + @model_id "Xenova/multilingual-e5-small" + + def model_id, do: @model_id + def dimensions, do: @dimensions + + def sync_post(%Post{} = post) do + if enabled_for_project?(post.project_id) do + body = resolve_post_body(post) + raw_text = compose_embedding_source(post.title, body) + content_hash = hash_text(raw_text) + + case Repo.get_by(Key, post_id: post.id, project_id: post.project_id) do + %Key{content_hash: ^content_hash} -> + :ok + + existing_key -> + label = existing_key_label(existing_key) || next_label() + vector = vectorize(raw_text, post.language) + + (existing_key || %Key{}) + |> Key.changeset(%{ + label: label, + post_id: post.id, + project_id: post.project_id, + content_hash: content_hash, + vector: Jason.encode!(vector) + }) + |> Repo.insert_or_update() + + :ok + end + else + :ok + end + end + + def sync_post(post_id) when is_binary(post_id) do + case Repo.get(Post, post_id) do + nil -> :ok + post -> sync_post(post) + end + end + + def remove_post(post_id) when is_binary(post_id) do + Repo.delete_all(from key in Key, where: key.post_id == ^post_id) + :ok + end + + def index_unindexed(project_id) when is_binary(project_id) do + if enabled_for_project?(project_id) do + posts = + Repo.all(from post in Post, where: post.project_id == ^project_id, order_by: [asc: post.created_at, asc: post.slug]) + + Enum.each(posts, fn post -> + body = resolve_post_body(post) + content_hash = hash_text(compose_embedding_source(post.title, body)) + + case Repo.get_by(Key, post_id: post.id, project_id: project_id) do + %Key{content_hash: ^content_hash} -> :ok + _other -> + :ok = sync_post(%{post | content: if(post.content in [nil, ""], do: body, else: post.content)}) + end + end) + + indexed = Repo.all(from key in Key, where: key.project_id == ^project_id, select: key.post_id) + + {:ok, indexed} + else + {:ok, []} + end + end + + def find_similar(post_id, limit \\ 5) when is_binary(post_id) and is_integer(limit) do + case source_post_and_vector(post_id) do + {:disabled, _project_id} -> {:ok, []} + {:error, :not_found} -> {:ok, []} + {:ok, post, source_vector} -> + similar = + Repo.all(from key in Key, where: key.project_id == ^post.project_id and key.post_id != ^post.id) + |> Enum.map(fn key -> %{post_id: key.post_id, score: cosine_similarity(source_vector, decode_vector(key.vector))} end) + |> Enum.sort_by(& &1.score, :desc) + |> Enum.take(max(limit, 0)) + + {:ok, similar} + end + end + + def compute_similarities(source_post_id, target_post_ids) + when is_binary(source_post_id) and is_list(target_post_ids) do + case source_post_and_vector(source_post_id) do + {:disabled, _project_id} -> {:ok, %{}} + {:error, :not_found} -> {:ok, %{}} + {:ok, post, source_vector} -> + target_ids = Enum.uniq(target_post_ids) + + scores = + Repo.all(from key in Key, where: key.project_id == ^post.project_id and key.post_id in ^target_ids) + |> Enum.reduce(%{}, fn key, acc -> + if key.post_id == source_post_id do + acc + else + Map.put(acc, key.post_id, cosine_similarity(source_vector, decode_vector(key.vector))) + end + end) + + {:ok, scores} + end + end + + def suggest_tags(post_id, _input_text) when is_binary(post_id) do + with {:ok, _post} <- fetch_post(post_id), + {:ok, similar} <- find_similar(post_id, 10) do + suggestions = + Repo.all(from other in Post, where: other.id in ^Enum.map(similar, & &1.post_id)) + |> Map.new(&{&1.id, &1}) + |> then(fn posts_by_id -> + Enum.reduce(similar, %{}, fn %{post_id: similar_post_id, score: score}, acc -> + case Map.get(posts_by_id, similar_post_id) do + nil -> acc + similar_post -> + Enum.reduce(similar_post.tags || [], acc, fn tag, tag_acc -> + Map.update(tag_acc, tag, score, &(&1 + score)) + end) + end + end) + end) + |> Enum.sort_by(fn {_tag, score} -> score end, :desc) + |> Enum.take(5) + |> Enum.map(fn {tag, _score} -> tag end) + + {:ok, suggestions} + else + {:error, :not_found} -> {:ok, []} + {:disabled, _project_id} -> {:ok, []} + end + end + + def find_duplicates(project_id) when is_binary(project_id) do + if enabled_for_project?(project_id) do + dismissed = dismissed_pair_keys(project_id) + keys = Repo.all(from key in Key, where: key.project_id == ^project_id, order_by: [asc: key.post_id]) + + duplicates = + for left <- keys, + right <- keys, + left.post_id < right.post_id, + pair_key(left.post_id, right.post_id) not in dismissed, + similarity = cosine_similarity(decode_vector(left.vector), decode_vector(right.vector)), + similarity >= @duplicate_threshold do + %{ + post_id_a: left.post_id, + post_id_b: right.post_id, + score: similarity + } + end + |> Enum.sort_by(& &1.score, :desc) + + {:ok, duplicates} + else + {:ok, []} + end + end + + def dismiss_duplicate_pair(post_id_a, post_id_b) + when is_binary(post_id_a) and is_binary(post_id_b) do + with {:ok, post_a} <- fetch_post(post_id_a), + {:ok, post_b} <- fetch_post(post_id_b), + true <- post_a.project_id == post_b.project_id do + {sorted_a, sorted_b} = sort_pair(post_id_a, post_id_b) + + pair = + Repo.get_by(DismissedDuplicatePair, + project_id: post_a.project_id, + post_id_a: sorted_a, + post_id_b: sorted_b + ) || %DismissedDuplicatePair{} + + saved_pair = + pair + |> DismissedDuplicatePair.changeset(%{ + id: pair.id || Ecto.UUID.generate(), + project_id: post_a.project_id, + post_id_a: sorted_a, + post_id_b: sorted_b, + dismissed_at: System.system_time(:second) + }) + |> Repo.insert_or_update!() + + {:ok, saved_pair} + else + _ -> {:error, :not_found} + end + end + + defp source_post_and_vector(post_id) do + with {:ok, post} <- fetch_post(post_id) do + if enabled_for_project?(post.project_id) do + :ok = ensure_key(post) + + case Repo.get_by(Key, post_id: post.id, project_id: post.project_id) do + nil -> {:ok, post, []} + key -> {:ok, post, decode_vector(key.vector)} + end + else + {:disabled, post.project_id} + end + end + end + + defp ensure_key(%Post{} = post) do + case Repo.get_by(Key, post_id: post.id, project_id: post.project_id) do + nil -> sync_post(post) + _key -> :ok + end + end + + defp fetch_post(post_id) do + case Repo.get(Post, post_id) do + nil -> {:error, :not_found} + post -> {:ok, post} + end + end + + defp enabled_for_project?(project_id) do + case Metadata.get_project_metadata(project_id) do + {:ok, metadata} -> metadata.semantic_similarity_enabled == true + _other -> false + end + end + + defp existing_key_label(nil), do: nil + defp existing_key_label(%Key{label: label}), do: label + + defp next_label do + Repo.one(from key in Key, select: max(key.label)) + |> case do + nil -> 1 + value -> value + 1 + end + end + + defp resolve_post_body(%Post{content: content}) when is_binary(content) and content != "", do: content + + defp resolve_post_body(%Post{project_id: project_id, file_path: file_path}) do + if file_path in [nil, ""] do + "" + else + project = Projects.get_project!(project_id) + full_path = Path.join(Projects.project_data_dir(project), file_path) + + case File.read(full_path) do + {:ok, contents} -> + case String.split(contents, "\n---\n", parts: 2) do + [_frontmatter, body] -> String.trim_trailing(body, "\n") + _parts -> contents + end + + {:error, _reason} -> + "" + end + end + end + + defp compose_embedding_source(title, content), do: "#{title || ""}\n\n#{content || ""}" + + defp hash_text(text), do: :crypto.hash(:sha256, text) |> Base.encode16(case: :lower) + + defp vectorize(text, language) do + stemmed = BDS.Search.stem(text, language) + tokens = tokenize(stemmed) + bigrams = tokens |> Enum.chunk_every(2, 1, :discard) |> Enum.map(&Enum.join(&1, "::")) + weighted_tokens = tokens ++ bigrams + vector_array = :array.new(@dimensions, default: 0.0) + + vector = + Enum.reduce(weighted_tokens, vector_array, fn token, acc -> + index = :erlang.phash2(token, @dimensions) + :array.set(index, :array.get(index, acc) + 1.0, acc) + end) + |> :array.to_list() + + normalize(vector) + end + + defp tokenize(text) do + Regex.scan(~r/[[:alnum:]]+/u, String.downcase(text)) + |> List.flatten() + end + + defp normalize(vector) do + norm = :math.sqrt(Enum.reduce(vector, 0.0, fn value, acc -> acc + value * value end)) + + if norm == 0.0 do + vector + else + Enum.map(vector, &(&1 / norm)) + end + end + + defp decode_vector(nil), do: [] + defp decode_vector(vector), do: Jason.decode!(vector) + + defp cosine_similarity([], _other), do: 0.0 + defp cosine_similarity(_vector, []), do: 0.0 + + defp cosine_similarity(left, right) do + Enum.zip(left, right) + |> Enum.reduce(0.0, fn {left_value, right_value}, acc -> acc + left_value * right_value end) + |> max(0.0) + end + + defp dismissed_pair_keys(project_id) do + Repo.all( + from pair in DismissedDuplicatePair, + where: pair.project_id == ^project_id, + select: {pair.post_id_a, pair.post_id_b} + ) + |> MapSet.new(fn {post_id_a, post_id_b} -> pair_key(post_id_a, post_id_b) end) + end + + defp pair_key(post_id_a, post_id_b) do + {sorted_a, sorted_b} = sort_pair(post_id_a, post_id_b) + "#{sorted_a}::#{sorted_b}" + end + + defp sort_pair(post_id_a, post_id_b) when post_id_a <= post_id_b, do: {post_id_a, post_id_b} + defp sort_pair(post_id_a, post_id_b), do: {post_id_b, post_id_a} +end diff --git a/lib/bds/embeddings/dismissed_duplicate_pair.ex b/lib/bds/embeddings/dismissed_duplicate_pair.ex new file mode 100644 index 0000000..3b1533f --- /dev/null +++ b/lib/bds/embeddings/dismissed_duplicate_pair.ex @@ -0,0 +1,24 @@ +defmodule BDS.Embeddings.DismissedDuplicatePair do + @moduledoc false + + use Ecto.Schema + import Ecto.Changeset + + @primary_key {:id, :string, autogenerate: false} + @foreign_key_type :string + + schema "dismissed_duplicate_pairs" do + belongs_to :project, BDS.Projects.Project, type: :string + field :post_id_a, :string + field :post_id_b, :string + field :dismissed_at, :integer + end + + def changeset(pair, attrs) do + pair + |> cast(attrs, [:id, :project_id, :post_id_a, :post_id_b, :dismissed_at]) + |> validate_required([:id, :project_id, :post_id_a, :post_id_b, :dismissed_at]) + |> foreign_key_constraint(:project_id) + |> unique_constraint(:post_id_a, name: :dismissed_pairs_idx) + end +end diff --git a/lib/bds/embeddings/key.ex b/lib/bds/embeddings/key.ex new file mode 100644 index 0000000..547d5c4 --- /dev/null +++ b/lib/bds/embeddings/key.ex @@ -0,0 +1,25 @@ +defmodule BDS.Embeddings.Key do + @moduledoc false + + use Ecto.Schema + import Ecto.Changeset + + @primary_key {:label, :integer, autogenerate: false} + @foreign_key_type :string + + schema "embedding_keys" do + belongs_to :post, BDS.Posts.Post, type: :string + belongs_to :project, BDS.Projects.Project, type: :string + + field :content_hash, :string + field :vector, :string + end + + def changeset(key, attrs) do + key + |> cast(attrs, [:label, :post_id, :project_id, :content_hash, :vector]) + |> validate_required([:label, :post_id, :project_id, :content_hash, :vector]) + |> foreign_key_constraint(:post_id) + |> foreign_key_constraint(:project_id) + end +end diff --git a/lib/bds/post_links.ex b/lib/bds/post_links.ex new file mode 100644 index 0000000..8198c9c --- /dev/null +++ b/lib/bds/post_links.ex @@ -0,0 +1,146 @@ +defmodule BDS.PostLinks do + @moduledoc false + + import Ecto.Query + + alias BDS.Posts.Link + alias BDS.Posts.Post + alias BDS.Projects + alias BDS.Repo + + def sync_post_links(%Post{} = post) do + links = + post + |> post_body() + |> extract_links() + |> Enum.map(&resolve_post_link(post.project_id, &1)) + |> Enum.reject(&is_nil/1) + |> Enum.uniq_by(fn %{target_post_id: target_post_id, link_text: link_text} -> + {target_post_id, link_text} + end) + + Repo.transaction(fn -> + Repo.delete_all(from link in Link, where: link.source_post_id == ^post.id) + + now = System.system_time(:second) + + Enum.each(links, fn %{target_post_id: target_post_id, link_text: link_text} -> + %Link{} + |> Link.changeset(%{ + id: Ecto.UUID.generate(), + source_post_id: post.id, + target_post_id: target_post_id, + link_text: link_text, + created_at: now + }) + |> Repo.insert!() + end) + end) + + :ok + end + + def delete_post_links(post_id) when is_binary(post_id) do + Repo.delete_all( + from link in Link, + where: link.source_post_id == ^post_id or link.target_post_id == ^post_id + ) + + :ok + end + + def list_outgoing_links(post_id) when is_binary(post_id) do + Repo.all(from link in Link, where: link.source_post_id == ^post_id, order_by: [asc: link.created_at]) + end + + def list_incoming_links(post_id) when is_binary(post_id) do + Repo.all(from link in Link, where: link.target_post_id == ^post_id, order_by: [asc: link.created_at]) + end + + defp post_body(%Post{content: content}) when is_binary(content), do: content + + defp post_body(%Post{project_id: project_id, file_path: file_path}) do + if file_path in [nil, ""] do + "" + else + project = Projects.get_project!(project_id) + full_path = Path.join(Projects.project_data_dir(project), file_path) + + case File.read(full_path) do + {:ok, contents} -> + case String.split(contents, "\n---\n", parts: 2) do + [_frontmatter, body] -> String.trim_trailing(body, "\n") + _parts -> contents + end + + {:error, _reason} -> + "" + end + end + end + + defp extract_links(body) when is_binary(body) do + markdown_links = + Regex.scan(~r/\[([^\]]+)\]\(([^)]+)\)/, body) + |> Enum.map(fn [_full, link_text, href] -> %{link_text: normalize_link_text(link_text), href: href} end) + + html_links = + Regex.scan(~r/]*href=["']([^"']+)["'][^>]*>(.*?)<\/a>/is, body) + |> Enum.map(fn [_full, href, link_text] -> %{link_text: normalize_link_text(link_text), href: href} end) + + markdown_links ++ html_links + end + + defp resolve_post_link(project_id, %{href: href, link_text: link_text}) do + path = + href + |> to_string() + |> String.trim() + |> URI.parse() + |> Map.get(:path) + + with path when is_binary(path) <- path, + slug when is_binary(slug) <- extract_slug(path), + %Post{id: target_post_id} <- Repo.get_by(Post, project_id: project_id, slug: slug) do + %{target_post_id: target_post_id, link_text: link_text} + else + _ -> nil + end + end + + defp extract_slug(path) do + segments = path |> String.split("/", trim: true) + + case segments do + [year, month, day, slug] -> + if numeric_year?(year) and numeric_month_or_day?(month) and numeric_month_or_day?(day), + do: slug, + else: nil + + [language, year, month, day, slug] -> + if language_code?(language) and numeric_year?(year) and numeric_month_or_day?(month) and + numeric_month_or_day?(day), + do: slug, + else: nil + + [slug] -> slug + [language, slug] -> if(language_code?(language), do: slug, else: nil) + _other -> nil + end + end + + defp numeric_year?(value), do: String.match?(value, ~r/^\d{4}$/) + defp numeric_month_or_day?(value), do: String.match?(value, ~r/^\d{2}$/) + defp language_code?(value), do: String.match?(value, ~r/^[a-z]{2}$/) + + defp normalize_link_text(value) do + value + |> to_string() + |> String.replace(~r/<[^>]+>/, "") + |> String.trim() + |> case do + "" -> nil + trimmed -> trimmed + end + end +end diff --git a/lib/bds/posts.ex b/lib/bds/posts.ex index 1eb3d93..8971cca 100644 --- a/lib/bds/posts.ex +++ b/lib/bds/posts.ex @@ -4,7 +4,9 @@ defmodule BDS.Posts do import Ecto.Query alias BDS.Frontmatter + alias BDS.Embeddings alias BDS.Metadata + alias BDS.PostLinks alias BDS.Posts.Post alias BDS.Posts.Translation alias BDS.Projects @@ -47,6 +49,7 @@ defmodule BDS.Posts do |> Repo.insert() |> case do {:ok, post} -> + :ok = Embeddings.sync_post(post) :ok = Search.sync_post(post) {:ok, post} @@ -75,6 +78,8 @@ defmodule BDS.Posts do |> Repo.update() |> case do {:ok, updated_post} -> + :ok = Embeddings.sync_post(updated_post) + :ok = PostLinks.sync_post_links(updated_post) :ok = Search.sync_post(updated_post) {:ok, updated_post} @@ -119,7 +124,9 @@ defmodule BDS.Posts do |> Repo.update() |> case do {:ok, updated_post} -> + :ok = Embeddings.sync_post(updated_post) :ok = publish_post_translations(updated_post) + :ok = PostLinks.sync_post_links(updated_post) :ok = Search.sync_post(updated_post) {:ok, updated_post} @@ -149,6 +156,8 @@ defmodule BDS.Posts do %Post{} = post -> delete_post_file(post) + :ok = Embeddings.remove_post(post.id) + :ok = PostLinks.delete_post_links(post.id) Repo.delete!(post) :ok = Search.delete_post(post.id) {:ok, :deleted} diff --git a/lib/bds/posts/link.ex b/lib/bds/posts/link.ex new file mode 100644 index 0000000..9445216 --- /dev/null +++ b/lib/bds/posts/link.ex @@ -0,0 +1,25 @@ +defmodule BDS.Posts.Link do + @moduledoc false + + use Ecto.Schema + import Ecto.Changeset + + @primary_key {:id, :string, autogenerate: false} + @foreign_key_type :string + + schema "post_links" do + belongs_to :source_post, BDS.Posts.Post, foreign_key: :source_post_id, references: :id, type: :string + belongs_to :target_post, BDS.Posts.Post, foreign_key: :target_post_id, references: :id, type: :string + + field :link_text, :string + field :created_at, :integer + end + + def changeset(link, attrs) do + link + |> cast(attrs, [:id, :source_post_id, :target_post_id, :link_text, :created_at]) + |> validate_required([:id, :source_post_id, :target_post_id, :created_at]) + |> foreign_key_constraint(:source_post_id) + |> foreign_key_constraint(:target_post_id) + end +end diff --git a/lib/bds/rendering.ex b/lib/bds/rendering.ex index 7a9d0ae..467f10d 100644 --- a/lib/bds/rendering.ex +++ b/lib/bds/rendering.ex @@ -5,16 +5,17 @@ defmodule BDS.Rendering do alias BDS.Frontmatter alias BDS.Media.Media, as: MediaAsset - alias BDS.Rendering.FileSystem alias BDS.Menu alias BDS.Metadata + alias BDS.PostLinks alias BDS.Projects - alias BDS.Rendering.Filters alias BDS.I18n + alias BDS.Rendering.FileSystem + alias BDS.Rendering.Filters alias BDS.Repo - alias BDS.Tags.Tag alias BDS.Posts.Post alias BDS.Posts.Translation + alias BDS.Tags.Tag alias BDS.Templates.Template def render_post_page(project_id, template_slug, assigns) @@ -119,10 +120,14 @@ defmodule BDS.Rendering do main_language = metadata.main_language || language post_record = load_post_record(assigns) + canonical_post = canonical_post_record(post_record) + post_id = canonical_post_id(post_record, assigns) post_categories = Map.get(post_record || %{}, :categories, []) || [] post_tags = Map.get(post_record || %{}, :tags, []) || [] canonical_post_paths = canonical_post_path_by_slug(project_id, main_language) canonical_media_paths = canonical_media_path_by_source_path(project_id) + incoming_links = link_contexts(project_id, post_id, :incoming, main_language) + outgoing_links = link_contexts(project_id, post_id, :outgoing, main_language) %{ language: language, @@ -141,18 +146,18 @@ defmodule BDS.Rendering do pico_stylesheet_href: default_pico_stylesheet_href(), html_theme_attribute: html_theme_attribute(metadata.pico_theme), blog_languages: blog_languages(metadata, language), - alternate_links: [], + alternate_links: alternate_links(canonical_post, project_id, main_language), menu_items: menu_items(project_id), calendar_initial_year: calendar_initial_year(post_record), calendar_initial_month: calendar_initial_month(post_record), post_categories: post_categories, post_tags: post_tags, tag_color_by_name: tag_color_by_name(project_id), - backlinks: [], + backlinks: backlinks(incoming_links), canonical_post_path_by_slug: canonical_post_paths, canonical_media_path_by_source_path: canonical_media_paths, post_data_json_by_id: post_data_json(assigns, post_record), - post: build_post_context(assigns, post_record) + post: build_post_context(assigns, post_record, incoming_links, outgoing_links) } end @@ -171,6 +176,10 @@ defmodule BDS.Rendering do canonical_post_paths = canonical_post_path_by_slug(project_id, main_language) canonical_media_paths = canonical_media_path_by_source_path(project_id) + day_blocks = build_day_blocks(posts) + min_date = min_date(posts) + max_date = max_date(posts) + normalized_archive_context = normalize_archive_context(archive_context) %{ language: language, @@ -189,10 +198,10 @@ defmodule BDS.Rendering do menu_items: menu_items(project_id), calendar_initial_year: calendar_initial_year_from_posts(posts), calendar_initial_month: calendar_initial_month_from_posts(posts), - archive_context: normalize_archive_context(archive_context), - show_archive_range_heading: false, - min_date: nil, - max_date: nil, + archive_context: normalized_archive_context, + show_archive_range_heading: show_archive_range_heading?(normalized_archive_context, day_blocks), + min_date: min_date, + max_date: max_date, is_list_page: true, is_first_page: pagination.current_page <= 1, is_last_page: pagination.current_page >= pagination.total_pages, @@ -208,14 +217,7 @@ defmodule BDS.Rendering do canonical_media_path_by_source_path: canonical_media_paths, post_data_json_by_id: Enum.into(posts, %{}, fn post -> {post.id, post_data_json_value(post)} end), - day_blocks: [ - %{ - date_label: "", - show_date_marker: false, - show_separator: false, - posts: posts - } - ] + day_blocks: day_blocks } end @@ -332,12 +334,23 @@ defmodule BDS.Rendering do end end + defp canonical_post_record(%Translation{translation_for: post_id}), do: Repo.get(Post, post_id) + defp canonical_post_record(%Post{} = post), do: post + defp canonical_post_record(_other), do: nil + + defp canonical_post_id(%Translation{translation_for: post_id}, _assigns), do: post_id + defp canonical_post_id(%Post{id: post_id}, _assigns), do: post_id + defp canonical_post_id(_post_record, assigns), do: Map.get(assigns, :id, Map.get(assigns, "id")) + defp post_data_json(assigns, post_record) do id = Map.get(assigns, :id, Map.get(assigns, "id")) if is_binary(id) do + incoming_links = link_contexts(Map.get(post_record || %{}, :project_id), canonical_post_id(post_record, assigns), :incoming, Map.get(post_record || %{}, :language)) + outgoing_links = link_contexts(Map.get(post_record || %{}, :project_id), canonical_post_id(post_record, assigns), :outgoing, Map.get(post_record || %{}, :language)) + %{ - id => post_data_json_value(build_post_context(assigns, post_record)) + id => post_data_json_value(build_post_context(assigns, post_record, incoming_links, outgoing_links)) } else %{} @@ -387,6 +400,58 @@ defmodule BDS.Rendering do end) end + defp alternate_links(nil, _project_id, _main_language), do: [] + + defp alternate_links(%Post{} = post, project_id, main_language) do + translations = + Repo.all( + from translation in Translation, + where: + translation.project_id == ^project_id and + translation.translation_for == ^post.id and + translation.status == :published, + order_by: [asc: translation.language] + ) + + [%{href: post_path(post, nil), hreflang: normalize_language(post.language, main_language)}] ++ + Enum.map(translations, fn translation -> + %{href: post_path(post, translation.language, main_language), hreflang: translation.language} + end) + end + + defp backlinks(incoming_links) do + Enum.map(incoming_links, fn link -> + %{path: link.href, display_slug: link.display_slug, title: link.title} + end) + end + + defp link_contexts(_project_id, nil, _direction, _main_language), do: [] + + defp link_contexts(project_id, post_id, :incoming, main_language) do + PostLinks.list_incoming_links(post_id) + |> Enum.map(&link_context(project_id, &1, :incoming, main_language)) + |> Enum.reject(&is_nil/1) + end + + defp link_contexts(project_id, post_id, :outgoing, main_language) do + PostLinks.list_outgoing_links(post_id) + |> Enum.map(&link_context(project_id, &1, :outgoing, main_language)) + |> Enum.reject(&is_nil/1) + end + + defp link_context(_project_id, link, direction, main_language) do + linked_post_id = + case direction do + :incoming -> link.source_post_id + :outgoing -> link.target_post_id + end + + case Repo.get(Post, linked_post_id) do + nil -> nil + linked_post -> %{href: post_path(linked_post, nil), title: linked_post.title, display_slug: linked_post.slug, language: normalize_language(linked_post.language, main_language)} + end + end + defp canonical_media_path_by_source_path(project_id) do Repo.all(from media in MediaAsset, where: media.project_id == ^project_id) |> Enum.reduce(%{}, fn media, acc -> @@ -407,7 +472,7 @@ defmodule BDS.Rendering do defp post_path(post, language_prefix) when is_binary(language_prefix) and language_prefix != "" do - Path.join([String.trim_leading(language_prefix, "/"), post_path(post, nil)]) + language_prefix <> post_path(post, nil) end defp post_path(post, nil) do @@ -444,20 +509,26 @@ defmodule BDS.Rendering do ), excerpt: Map.get(post, :excerpt, Map.get(post, "excerpt", Map.get(post_record || %{}, :excerpt))), - author: Map.get(post_record || %{}, :author), + author: Map.get(post, :author, Map.get(post, "author", Map.get(post_record || %{}, :author))), language: Map.get( post, :language, Map.get(post, "language", Map.get(post_record || %{}, :language)) ), - published_at: Map.get(post_record || %{}, :published_at), - created_at: Map.get(post_record || %{}, :created_at), - updated_at: Map.get(post_record || %{}, :updated_at), - tags: Map.get(post_record || %{}, :tags, []) || [], - categories: Map.get(post_record || %{}, :categories, []) || [], - template_slug: Map.get(post_record || %{}, :template_slug), - do_not_translate: Map.get(post_record || %{}, :do_not_translate, false), + published_at: + Map.get(post, :published_at, Map.get(post, "published_at", Map.get(post_record || %{}, :published_at))), + created_at: + Map.get(post, :created_at, Map.get(post, "created_at", Map.get(post_record || %{}, :created_at))), + updated_at: + Map.get(post, :updated_at, Map.get(post, "updated_at", Map.get(post_record || %{}, :updated_at))), + tags: Map.get(post, :tags, Map.get(post, "tags", Map.get(post_record || %{}, :tags, []))) || [], + categories: + Map.get(post, :categories, Map.get(post, "categories", Map.get(post_record || %{}, :categories, []))) || [], + template_slug: + Map.get(post, :template_slug, Map.get(post, "template_slug", Map.get(post_record || %{}, :template_slug))), + do_not_translate: + Map.get(post, :do_not_translate, Map.get(post, "do_not_translate", Map.get(post_record || %{}, :do_not_translate, false))), href: Map.get(post, :href, Map.get(post, "href")), show_title: true, linked_media: [], @@ -467,7 +538,7 @@ defmodule BDS.Rendering do end) end - defp build_post_context(assigns, post_record) do + defp build_post_context(assigns, post_record, incoming_links, outgoing_links) do %{ id: Map.get(assigns, :id, Map.get(assigns, "id")), slug: Map.get(assigns, :slug, Map.get(assigns, "slug")), @@ -500,8 +571,8 @@ defmodule BDS.Rendering do ), do_not_translate: Map.get(post_record || %{}, :do_not_translate, false), linked_media: [], - outgoing_links: [], - incoming_links: [] + outgoing_links: outgoing_links, + incoming_links: incoming_links } end @@ -544,7 +615,56 @@ defmodule BDS.Rendering do end defp normalize_archive_context(nil), do: nil - defp normalize_archive_context(%{} = archive_context), do: archive_context + + defp normalize_archive_context(%{} = archive_context) do + %{ + kind: Map.get(archive_context, :kind, Map.get(archive_context, "kind")), + name: Map.get(archive_context, :name, Map.get(archive_context, "name")), + month: Map.get(archive_context, :month, Map.get(archive_context, "month")), + year: Map.get(archive_context, :year, Map.get(archive_context, "year")), + day: Map.get(archive_context, :day, Map.get(archive_context, "day")) + } + end + + defp build_day_blocks(posts) do + grouped_blocks = + posts + |> Enum.filter(&is_integer(Map.get(&1, :created_at))) + |> Enum.group_by(&DateTime.from_unix!(Map.get(&1, :created_at)) |> DateTime.to_date() |> Date.to_iso8601()) + |> Enum.sort_by(fn {label, _posts} -> label end) + + grouped_blocks + |> Enum.with_index() + |> Enum.map(fn {{date_label, grouped_posts}, index} -> + %{ + date_label: date_label, + show_date_marker: true, + show_separator: index < length(grouped_blocks) - 1, + posts: Enum.sort_by(grouped_posts, &Map.get(&1, :created_at)) + } + end) + |> case do + [] -> [%{date_label: "", show_date_marker: false, show_separator: false, posts: posts}] + blocks -> blocks + end + end + + defp min_date(posts) do + posts + |> Enum.map(&Map.get(&1, :created_at)) + |> Enum.filter(&is_integer/1) + |> Enum.min(fn -> nil end) + end + + defp max_date(posts) do + posts + |> Enum.map(&Map.get(&1, :created_at)) + |> Enum.filter(&is_integer/1) + |> Enum.max(fn -> nil end) + end + + defp show_archive_range_heading?(%{kind: "date"}, _day_blocks), do: true + defp show_archive_range_heading?(_archive_context, _day_blocks), do: false defp html_theme_attribute(nil), do: nil defp html_theme_attribute(""), do: nil @@ -574,4 +694,15 @@ defmodule BDS.Rendering do defp language_prefix(language, main_language) when language == main_language, do: "" defp language_prefix(nil, _main_language), do: "" defp language_prefix(language, _main_language), do: "/#{language}" + + defp normalize_language(nil, fallback), do: fallback + defp normalize_language("", fallback), do: fallback + + defp normalize_language(language, _fallback) do + language + |> to_string() + |> String.downcase() + |> String.split("-", parts: 2) + |> hd() + end end diff --git a/test/bds/embeddings_test.exs b/test/bds/embeddings_test.exs new file mode 100644 index 0000000..a78b2eb --- /dev/null +++ b/test/bds/embeddings_test.exs @@ -0,0 +1,106 @@ +defmodule BDS.EmbeddingsTest do + use ExUnit.Case, async: false + + setup do + :ok = Ecto.Adapters.SQL.Sandbox.checkout(BDS.Repo) + + temp_dir = Path.join(System.tmp_dir!(), "bds-embeddings-#{System.unique_integer([:positive])}") + File.mkdir_p!(temp_dir) + + on_exit(fn -> File.rm_rf(temp_dir) end) + + {:ok, project} = BDS.Projects.create_project(%{name: "Embeddings", data_path: temp_dir}) + %{project: project} + end + + test "embeddings index published posts when semantic similarity is enabled and support similarity, duplicates, dismissals, and tag suggestions", + %{project: project} do + assert {:ok, _metadata} = + BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true}) + + assert {:ok, alpha} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Space Travel", + content: "space rocket launch orbit mission galaxy", + tags: ["space", "science"], + language: "en" + }) + + assert {:ok, beta} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Rocket Mission", + content: "rocket launch mission orbit space station", + tags: ["space", "mission"], + language: "en" + }) + + assert {:ok, gamma} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Bread Baking", + content: "flour yeast dough oven loaf kitchen", + tags: ["food"], + language: "en" + }) + + assert {:ok, alpha} = BDS.Posts.publish_post(alpha.id) + assert {:ok, beta} = BDS.Posts.publish_post(beta.id) + assert {:ok, gamma} = BDS.Posts.publish_post(gamma.id) + + assert {:ok, indexed} = BDS.Embeddings.index_unindexed(project.id) + assert Enum.sort(indexed) == Enum.sort([alpha.id, beta.id, gamma.id]) + + assert {:ok, similar} = BDS.Embeddings.find_similar(alpha.id, 2) + assert length(similar) == 2 + assert hd(similar).post_id == beta.id + assert hd(similar).score > List.last(similar).score + + assert {:ok, scores} = BDS.Embeddings.compute_similarities(alpha.id, [beta.id, gamma.id]) + assert scores[beta.id] > scores[gamma.id] + + assert {:ok, suggestions} = BDS.Embeddings.suggest_tags(alpha.id, "rocket orbit mission") + assert "space" in suggestions + + assert {:ok, duplicates} = BDS.Embeddings.find_duplicates(project.id) + assert Enum.any?(duplicates, fn pair -> + MapSet.new([pair.post_id_a, pair.post_id_b]) == MapSet.new([alpha.id, beta.id]) + end) + + assert {:ok, dismissal} = BDS.Embeddings.dismiss_duplicate_pair(alpha.id, beta.id) + assert dismissal.project_id == project.id + + assert {:ok, filtered_duplicates} = BDS.Embeddings.find_duplicates(project.id) + + refute Enum.any?(filtered_duplicates, fn pair -> + MapSet.new([pair.post_id_a, pair.post_id_b]) == MapSet.new([alpha.id, beta.id]) + end) + + assert {:ok, alpha} = BDS.Posts.update_post(alpha.id, %{content: "kitchen flour dough loaf"}) + assert {:ok, alpha} = BDS.Posts.publish_post(alpha.id) + + assert {:ok, updated_scores} = BDS.Embeddings.compute_similarities(alpha.id, [beta.id, gamma.id]) + assert updated_scores[gamma.id] > updated_scores[beta.id] + + assert {:ok, :deleted} = BDS.Posts.delete_post(gamma.id) + + assert {:ok, after_delete} = BDS.Embeddings.compute_similarities(alpha.id, [beta.id, gamma.id]) + refute Map.has_key?(after_delete, gamma.id) + end + + test "embedding queries are gated off when semantic similarity is disabled", %{project: project} do + assert {:ok, post} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Disabled", + content: "space rocket mission" + }) + + assert {:ok, post} = BDS.Posts.publish_post(post.id) + + assert {:ok, []} = BDS.Embeddings.find_similar(post.id, 5) + assert {:ok, []} = BDS.Embeddings.find_duplicates(project.id) + assert {:ok, %{}} = BDS.Embeddings.compute_similarities(post.id, [post.id]) + end +end diff --git a/test/bds/post_links_test.exs b/test/bds/post_links_test.exs new file mode 100644 index 0000000..3e4d36d --- /dev/null +++ b/test/bds/post_links_test.exs @@ -0,0 +1,98 @@ +defmodule BDS.PostLinksTest do + use ExUnit.Case, async: false + + alias BDS.Repo + + setup do + :ok = Ecto.Adapters.SQL.Sandbox.checkout(BDS.Repo) + + temp_dir = Path.join(System.tmp_dir!(), "bds-post-links-#{System.unique_integer([:positive])}") + File.mkdir_p!(temp_dir) + + on_exit(fn -> File.rm_rf(temp_dir) end) + + {:ok, project} = BDS.Projects.create_project(%{name: "Links", data_path: temp_dir}) + %{project: project, temp_dir: temp_dir} + end + + test "publishing and updating posts sync outgoing post links and deleting a post removes them", %{ + project: project + } do + assert {:ok, target} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Target Post", + content: "target body" + }) + + assert {:ok, target} = BDS.Posts.publish_post(target.id) + + target_href = canonical_post_href(target) + + assert {:ok, source} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Source Post", + content: "See [Target](#{target_href})" + }) + + assert {:ok, source} = BDS.Posts.publish_post(source.id) + + assert post_links() == [ + %{ + source_post_id: source.id, + target_post_id: target.id, + link_text: "Target" + } + ] + + assert {:ok, source} = + BDS.Posts.update_post(source.id, %{ + content: "A revised body without a post reference" + }) + + assert source.status == :draft + assert post_links() == [] + + assert {:ok, source} = + BDS.Posts.update_post(source.id, %{ + content: "Now [Target Again](#{target_href})" + }) + + assert {:ok, source} = BDS.Posts.publish_post(source.id) + + assert post_links() == [ + %{ + source_post_id: source.id, + target_post_id: target.id, + link_text: "Target Again" + } + ] + + assert {:ok, :deleted} = BDS.Posts.delete_post(target.id) + assert post_links() == [] + end + + defp canonical_post_href(post) do + datetime = DateTime.from_unix!(post.created_at) + + Path.join([ + "", + Integer.to_string(datetime.year), + String.pad_leading(Integer.to_string(datetime.month), 2, "0"), + String.pad_leading(Integer.to_string(datetime.day), 2, "0"), + post.slug, + "" + ]) + end + + defp post_links do + Repo.query!( + "SELECT source_post_id, target_post_id, link_text FROM post_links ORDER BY source_post_id, target_post_id", + [] + ).rows + |> Enum.map(fn [source_post_id, target_post_id, link_text] -> + %{source_post_id: source_post_id, target_post_id: target_post_id, link_text: link_text} + end) + end +end diff --git a/test/bds/rendering_test.exs b/test/bds/rendering_test.exs index 59f4560..3369428 100644 --- a/test/bds/rendering_test.exs +++ b/test/bds/rendering_test.exs @@ -72,6 +72,82 @@ defmodule BDS.RenderingTest do assert rendered =~ "|2|1|#{published_template.slug}|true" end + test "render_post_page exposes alternate links, backlinks, and post link context", %{ + project: project + } do + assert {:ok, _metadata} = + BDS.Metadata.update_project_metadata(project.id, %{ + main_language: "en", + blog_languages: ["en", "de"] + }) + + assert {:ok, template} = + BDS.Templates.create_template(%{ + project_id: project.id, + title: "Render Link Context", + kind: :post, + content: + "alts={% for alt in alternate_links %}[{{ alt.hreflang }}={{ alt.href }}]{% endfor %}|backlinks={% for backlink in backlinks %}[{{ backlink.display_slug }}={{ backlink.title }}={{ backlink.path }}]{% endfor %}|outgoing={% for link in post.outgoing_links %}[{{ link.display_slug }}={{ link.title }}={{ link.href }}]{% endfor %}|incoming={% for link in post.incoming_links %}[{{ link.display_slug }}={{ link.title }}={{ link.href }}]{% endfor %}" + }) + + assert {:ok, published_template} = BDS.Templates.publish_template(template.id) + + assert {:ok, target} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Linked Target", + content: "target body", + language: "en", + template_slug: published_template.slug + }) + + assert {:ok, _translation} = + BDS.Posts.upsert_post_translation(target.id, "de", %{ + title: "Verlinktes Ziel", + content: "zieltext" + }) + + assert {:ok, target} = BDS.Posts.publish_post(target.id) + + assert {:ok, source} = + BDS.Posts.create_post(%{ + project_id: project.id, + title: "Linking Source", + content: "See [Target Link](#{canonical_post_href(target)})", + language: "en", + template_slug: published_template.slug + }) + + assert {:ok, source} = BDS.Posts.publish_post(source.id) + + assert {:ok, rendered_target} = + Rendering.render_post_page(project.id, published_template.slug, %{ + id: target.id, + title: target.title, + content: target.content || "", + slug: target.slug, + language: "en", + template_slug: published_template.slug + }) + + assert rendered_target =~ "alts=[en=#{canonical_post_href(target)}]" + assert rendered_target =~ "[de=/de#{canonical_post_href(target)}]" + assert rendered_target =~ "backlinks=[linking-source=Linking Source=#{canonical_post_href(source)}]" + assert rendered_target =~ "incoming=[linking-source=Linking Source=#{canonical_post_href(source)}]" + + assert {:ok, rendered_source} = + Rendering.render_post_page(project.id, published_template.slug, %{ + id: source.id, + title: source.title, + content: source.content || "", + slug: source.slug, + language: "en", + template_slug: published_template.slug + }) + + assert rendered_source =~ "outgoing=[linked-target=Linked Target=#{canonical_post_href(target)}]" + end + test "render_list_page exposes pagination and render_not_found_page localizes default copy", %{ project: project } do @@ -149,4 +225,95 @@ defmodule BDS.RenderingTest do assert published_list_template.kind == :list end + + test "render_list_page groups posts into day blocks and exposes archive range fields", %{ + project: project + } do + assert {:ok, _metadata} = + BDS.Metadata.update_project_metadata(project.id, %{ + main_language: "en", + blog_languages: ["en"] + }) + + assert {:ok, list_template} = + BDS.Templates.create_template(%{ + project_id: project.id, + title: "Render Day Blocks", + kind: :list, + content: + "range={{ min_date }}-{{ max_date }}|heading={{ show_archive_range_heading }}|blocks={% for block in day_blocks %}[{{ block.date_label }}:{{ block.posts.size }}:{{ block.show_date_marker }}]{% endfor %}|archive={{ archive_context.kind }}:{{ archive_context.year }}:{{ archive_context.month }}" + }) + + assert {:ok, published_list_template} = BDS.Templates.publish_template(list_template.id) + + BDS.Repo.update_all( + from(template in BDS.Templates.Template, + where: + template.project_id == ^project.id and template.kind == :list and + template.id != ^published_list_template.id + ), + set: [enabled: false] + ) + + first_day = 1_711_843_200 + second_day = first_day + 86_400 + + posts = [ + %{ + id: "first", + slug: "first", + title: "First", + excerpt: "one", + language: "en", + created_at: first_day, + updated_at: first_day, + published_at: first_day, + tags: [], + categories: [], + href: "/2024/03/31/first/" + }, + %{ + id: "second", + slug: "second", + title: "Second", + excerpt: "two", + language: "en", + created_at: second_day, + updated_at: second_day, + published_at: second_day, + tags: [], + categories: [], + href: "/2024/04/01/second/" + } + ] + + assert {:ok, rendered} = + Rendering.render_list_page(project.id, %{ + language: "en", + page_title: "Archive", + posts: posts, + archive_context: %{kind: "date", year: 2024, month: 4}, + pagination: %{ + current_page: 1, + total_pages: 1, + total_items: 2, + items_per_page: 10, + has_prev_page: false, + prev_page_href: nil, + has_next_page: false, + next_page_href: nil + } + }) + + assert rendered =~ "heading=true" + assert rendered =~ "blocks=[2024-03-31:1:true][2024-04-01:1:true]" + assert rendered =~ "archive=date:2024:4" + assert rendered =~ "range=1711843200-1711929600" + end + + defp canonical_post_href(post) do + datetime = DateTime.from_unix!(post.created_at) + + "/#{datetime.year}/#{String.pad_leading(Integer.to_string(datetime.month), 2, "0")}/#{String.pad_leading(Integer.to_string(datetime.day), 2, "0")}/#{post.slug}/" + end end