feat: more completeness of spec and start at embedding

This commit is contained in:
2026-04-24 07:57:28 +02:00
parent d04117abdc
commit 88f966dae9
10 changed files with 1105 additions and 32 deletions

342
lib/bds/embeddings.ex Normal file
View File

@@ -0,0 +1,342 @@
defmodule BDS.Embeddings do
@moduledoc false
import Ecto.Query
alias BDS.Embeddings.DismissedDuplicatePair
alias BDS.Embeddings.Key
alias BDS.Metadata
alias BDS.Posts.Post
alias BDS.Projects
alias BDS.Repo
@dimensions 384
@duplicate_threshold 0.5
@model_id "Xenova/multilingual-e5-small"
@doc "Returns the value of the `@model_id` module attribute."
def model_id do
  @model_id
end
@doc "Returns the length of the vectors produced by this module (`@dimensions`)."
def dimensions do
  @dimensions
end
@doc """
Creates or refreshes the stored embedding key for a post.

Accepts either a `%Post{}` struct or a post id. Nothing is written when semantic
similarity is disabled for the post's project, when the id does not resolve to a
post, or when the stored key already carries the hash of the current title and body.

Always returns `:ok`; the result of the database write is not inspected.
"""
def sync_post(%Post{} = post) do
  if enabled_for_project?(post.project_id) do
    source_text = compose_embedding_source(post.title, resolve_post_body(post))
    digest = hash_text(source_text)
    stored_key = Repo.get_by(Key, post_id: post.id, project_id: post.project_id)

    if match?(%Key{content_hash: ^digest}, stored_key) do
      # The stored vector was computed from identical text; nothing to do.
      :ok
    else
      # Keep the label of an existing key, otherwise allocate a fresh one.
      label = existing_key_label(stored_key) || next_label()
      vector = vectorize(source_text, post.language)

      attrs = %{
        label: label,
        post_id: post.id,
        project_id: post.project_id,
        content_hash: digest,
        vector: Jason.encode!(vector)
      }

      # The write result is deliberately discarded, as callers match on `:ok`.
      _write_result =
        (stored_key || %Key{})
        |> Key.changeset(attrs)
        |> Repo.insert_or_update()

      :ok
    end
  else
    :ok
  end
end

def sync_post(post_id) when is_binary(post_id) do
  case Repo.get(Post, post_id) do
    %Post{} = post -> sync_post(post)
    nil -> :ok
  end
end
@doc "Deletes every embedding key stored for `post_id` and returns `:ok`."
def remove_post(post_id) when is_binary(post_id) do
  Key
  |> where([key], key.post_id == ^post_id)
  |> Repo.delete_all()

  :ok
end
@doc """
Brings the embedding keys of every post in a project up to date.

Posts are visited ordered by `created_at`, then `slug`. A post is re-synced when it has
no key yet or when its stored content hash differs from the hash of its current title
and body.

Returns `{:ok, post_ids}` listing the post ids of all keys stored for the project, or
`{:ok, []}` when semantic similarity is disabled for it.
"""
def index_unindexed(project_id) when is_binary(project_id) do
  if enabled_for_project?(project_id) do
    project_posts =
      Post
      |> where([post], post.project_id == ^project_id)
      |> order_by([post], asc: post.created_at, asc: post.slug)
      |> Repo.all()

    for post <- project_posts do
      body = resolve_post_body(post)
      digest = hash_text(compose_embedding_source(post.title, body))
      stored_key = Repo.get_by(Key, post_id: post.id, project_id: project_id)

      if not match?(%Key{content_hash: ^digest}, stored_key) do
        # Hand the already resolved body over so a non-empty body is not resolved again.
        content = if post.content in [nil, ""], do: body, else: post.content
        :ok = sync_post(%{post | content: content})
      end
    end

    indexed_post_ids =
      Key
      |> where([key], key.project_id == ^project_id)
      |> select([key], key.post_id)
      |> Repo.all()

    {:ok, indexed_post_ids}
  else
    {:ok, []}
  end
end
@doc """
Ranks the other posts of the same project by similarity to `post_id`.

Returns `{:ok, entries}` where each entry is `%{post_id: id, score: score}`, sorted by
descending score and truncated to `limit` entries (a negative `limit` yields none).
Returns `{:ok, []}` when the post does not exist or semantic similarity is disabled
for its project.
"""
def find_similar(post_id, limit \\ 5) when is_binary(post_id) and is_integer(limit) do
  with {:ok, post, source_vector} <- source_post_and_vector(post_id) do
    candidate_keys =
      Repo.all(
        from key in Key,
          where: key.project_id == ^post.project_id and key.post_id != ^post.id
      )

    scored =
      for key <- candidate_keys do
        %{
          post_id: key.post_id,
          score: cosine_similarity(source_vector, decode_vector(key.vector))
        }
      end

    ranked =
      scored
      |> Enum.sort_by(& &1.score, :desc)
      |> Enum.take(max(limit, 0))

    {:ok, ranked}
  else
    {:disabled, _project_id} -> {:ok, []}
    {:error, :not_found} -> {:ok, []}
  end
end
@doc """
Scores a given set of posts against a source post.

Returns `{:ok, scores}` where `scores` maps each target post id that has a stored key in
the source post's project to its similarity score. The source post itself is never
included. Returns `{:ok, %{}}` when the source post does not exist or semantic
similarity is disabled for its project.
"""
def compute_similarities(source_post_id, target_post_ids)
    when is_binary(source_post_id) and is_list(target_post_ids) do
  with {:ok, post, source_vector} <- source_post_and_vector(source_post_id) do
    wanted_ids = Enum.uniq(target_post_ids)

    candidate_keys =
      Repo.all(
        from key in Key,
          where: key.project_id == ^post.project_id and key.post_id in ^wanted_ids
      )

    scores =
      for key <- candidate_keys, key.post_id != source_post_id, into: %{} do
        {key.post_id, cosine_similarity(source_vector, decode_vector(key.vector))}
      end

    {:ok, scores}
  else
    {:disabled, _project_id} -> {:ok, %{}}
    {:error, :not_found} -> {:ok, %{}}
  end
end
@doc """
Suggests up to five tags for a post, taken from its most similar posts.

The ten most similar posts are looked up; every tag on those posts accumulates the
similarity score of each post carrying it, and the five highest-scoring tags are
returned as `{:ok, tags}`. The second argument is currently ignored.

Returns `{:ok, []}` when the post does not exist.
"""
def suggest_tags(post_id, _input_text) when is_binary(post_id) do
  with {:ok, _post} <- fetch_post(post_id),
       {:ok, similar} <- find_similar(post_id, 10) do
    similar_ids = Enum.map(similar, & &1.post_id)

    posts_by_id =
      from(other in Post, where: other.id in ^similar_ids)
      |> Repo.all()
      |> Map.new(&{&1.id, &1})

    # Walk the similar posts in ranking order and add each post's score to its tags.
    tag_totals =
      for %{post_id: similar_post_id, score: score} <- similar,
          %Post{tags: tags} <- [Map.get(posts_by_id, similar_post_id)],
          tag <- tags || [],
          reduce: %{} do
        totals -> Map.update(totals, tag, score, &(&1 + score))
      end

    suggestions =
      tag_totals
      |> Enum.sort_by(fn {_tag, total} -> total end, :desc)
      |> Enum.take(5)
      |> Enum.map(fn {tag, _total} -> tag end)

    {:ok, suggestions}
  else
    {:error, :not_found} -> {:ok, []}
    {:disabled, _project_id} -> {:ok, []}
  end
end
@doc """
Lists pairs of posts in a project whose embeddings look like duplicates.

Every unordered pair of stored keys is compared; a pair is reported when its similarity
is at least `@duplicate_threshold` and it has not been dismissed via
`dismiss_duplicate_pair/2`.

Returns `{:ok, pairs}` sorted by descending score, each pair shaped as
`%{post_id_a: id, post_id_b: id, score: score}` with `post_id_a < post_id_b`, or
`{:ok, []}` when semantic similarity is disabled for the project.
"""
def find_duplicates(project_id) when is_binary(project_id) do
  if enabled_for_project?(project_id) do
    dismissed = dismissed_pair_keys(project_id)

    keys =
      Repo.all(from key in Key, where: key.project_id == ^project_id, order_by: [asc: key.post_id])

    # Decode each stored vector exactly once. Decoding inside the pair loop would
    # parse the same JSON document once per pair, i.e. a quadratic number of times.
    decoded = Enum.map(keys, fn key -> {key.post_id, decode_vector(key.vector)} end)

    candidate_pairs =
      for {left_id, left_vector} <- decoded,
          {right_id, right_vector} <- decoded,
          # Visit each unordered pair once and never pair a post with itself.
          left_id < right_id,
          pair_key(left_id, right_id) not in dismissed,
          similarity = cosine_similarity(left_vector, right_vector),
          similarity >= @duplicate_threshold do
        %{post_id_a: left_id, post_id_b: right_id, score: similarity}
      end

    {:ok, Enum.sort_by(candidate_pairs, & &1.score, :desc)}
  else
    {:ok, []}
  end
end
@doc """
Records that two posts of the same project are not duplicates of each other.

The pair is stored with its ids in sorted order; an existing record for the pair is
updated with a new `dismissed_at` timestamp (seconds) instead of being duplicated.

Returns `{:ok, pair}` on success and `{:error, :not_found}` when either post is missing
or the posts belong to different projects. Raises if the record cannot be saved.
"""
def dismiss_duplicate_pair(post_id_a, post_id_b)
    when is_binary(post_id_a) and is_binary(post_id_b) do
  # The pinned project id in the second clause rejects posts from different projects.
  with {:ok, %Post{project_id: project_id}} <- fetch_post(post_id_a),
       {:ok, %Post{project_id: ^project_id}} <- fetch_post(post_id_b) do
    {first_id, second_id} = sort_pair(post_id_a, post_id_b)
    identity = [project_id: project_id, post_id_a: first_id, post_id_b: second_id]

    record = Repo.get_by(DismissedDuplicatePair, identity) || %DismissedDuplicatePair{}

    attrs =
      identity
      |> Map.new()
      |> Map.merge(%{
        id: record.id || Ecto.UUID.generate(),
        dismissed_at: System.system_time(:second)
      })

    saved_pair =
      record
      |> DismissedDuplicatePair.changeset(attrs)
      |> Repo.insert_or_update!()

    {:ok, saved_pair}
  else
    _ -> {:error, :not_found}
  end
end
# Loads a post together with its decoded embedding vector, creating the key first when
# none is stored. Returns `{:ok, post, vector}` (the vector is `[]` when there is still
# no key), `{:disabled, project_id}` when semantic similarity is off for the project,
# or `{:error, :not_found}` when the post does not exist.
defp source_post_and_vector(post_id) do
  case fetch_post(post_id) do
    {:ok, post} ->
      if enabled_for_project?(post.project_id) do
        :ok = ensure_key(post)
        stored_key = Repo.get_by(Key, post_id: post.id, project_id: post.project_id)
        # `decode_vector/1` turns `nil` into `[]`, which covers a missing key as well.
        {:ok, post, decode_vector(stored_key && stored_key.vector)}
      else
        {:disabled, post.project_id}
      end

    failure ->
      failure
  end
end
# Syncs the post only when it has no stored embedding key yet; an existing key is left
# untouched even if the post content changed since it was written.
defp ensure_key(%Post{} = post) do
  if Repo.get_by(Key, post_id: post.id, project_id: post.project_id) do
    :ok
  else
    sync_post(post)
  end
end
# Looks a post up by id and wraps the outcome in a tagged tuple.
defp fetch_post(post_id) do
  with %Post{} = post <- Repo.get(Post, post_id) do
    {:ok, post}
  else
    nil -> {:error, :not_found}
  end
end
# True only when the project's metadata can be loaded and its
# `semantic_similarity_enabled` flag is exactly `true`.
defp enabled_for_project?(project_id) do
  with {:ok, metadata} <- Metadata.get_project_metadata(project_id) do
    metadata.semantic_similarity_enabled == true
  else
    _other -> false
  end
end
# Returns the label of an existing key, or `nil` when there is no key.
defp existing_key_label(key) do
  case key do
    nil -> nil
    %Key{label: label} -> label
  end
end
# Returns one more than the highest label currently stored, or 1 when no key exists.
# NOTE(review): read-then-increment is not atomic; two concurrent callers could obtain
# the same label — confirm whether callers are serialized.
defp next_label do
  highest = Repo.one(from key in Key, select: max(key.label))
  (highest || 0) + 1
end
# Resolves the text body of a post.
#
# Non-empty inline content wins. Otherwise the post's file is read from the project's
# data directory and everything after the first "\n---\n" separator is returned with
# trailing newlines removed (the whole file when there is no separator). A missing
# file path or an unreadable file yields "".
defp resolve_post_body(%Post{content: content}) when is_binary(content) and content != "",
  do: content

defp resolve_post_body(%Post{file_path: file_path}) when file_path in [nil, ""], do: ""

defp resolve_post_body(%Post{project_id: project_id, file_path: file_path}) do
  data_dir =
    project_id
    |> Projects.get_project!()
    |> Projects.project_data_dir()

  case File.read(Path.join(data_dir, file_path)) do
    {:error, _reason} ->
      ""

    {:ok, contents} ->
      case String.split(contents, "\n---\n", parts: 2) do
        [_frontmatter, body] -> String.trim_trailing(body, "\n")
        _parts -> contents
      end
  end
end
# Builds the text that gets hashed and vectorized: title, a blank line, then the body.
# A `nil` title or body is treated as an empty string.
defp compose_embedding_source(title, content) do
  Enum.join([title || "", content || ""], "\n\n")
end
# Returns the SHA-256 digest of `text` as a lowercase hexadecimal string.
defp hash_text(text) do
  digest = :crypto.hash(:sha256, text)
  Base.encode16(digest, case: :lower)
end
# Turns text into a unit-length vector of `@dimensions` floats.
#
# The text is stemmed with `BDS.Search.stem/2` and tokenized; every token and every
# pair of adjacent tokens (joined with "::") is hashed into one of `@dimensions`
# buckets, each bucket counts its hits, and the counts are normalized.
defp vectorize(text, language) do
  tokens = text |> BDS.Search.stem(language) |> tokenize()

  bigrams =
    for {first, second} <- Enum.zip(tokens, Enum.drop(tokens, 1)) do
      "#{first}::#{second}"
    end

  bucket_counts =
    Enum.frequencies_by(tokens ++ bigrams, &:erlang.phash2(&1, @dimensions))

  # Multiplying by 1.0 yields float counts, with 0.0 for buckets that were never hit.
  raw_vector =
    for bucket <- 0..(@dimensions - 1)//1 do
      Map.get(bucket_counts, bucket, 0) * 1.0
    end

  normalize(raw_vector)
end
# Lowercases the text and returns its runs of alphanumeric characters, in order.
defp tokenize(text) do
  lowered = String.downcase(text)

  # The pattern has no capture groups, so every match is a one-element list.
  for [word] <- Regex.scan(~r/[[:alnum:]]+/u, lowered), do: word
end
# Scales a vector to Euclidean length 1. A vector whose length is zero (including the
# empty list) is returned unchanged to avoid dividing by zero.
defp normalize(vector) do
  squared_sum = Enum.reduce(vector, 0.0, fn component, total -> total + component * component end)
  length = :math.sqrt(squared_sum)

  if length > 0.0 do
    for component <- vector, do: component / length
  else
    vector
  end
end
# Decodes a JSON-encoded vector; a missing (`nil`) vector decodes to the empty list.
defp decode_vector(vector) do
  if is_nil(vector), do: [], else: Jason.decode!(vector)
end
# Dot product of two vectors, clamped so it is never negative. Components are paired
# positionally up to the shorter vector; an empty vector on either side scores 0.0.
defp cosine_similarity([], _other), do: 0.0
defp cosine_similarity(_vector, []), do: 0.0

defp cosine_similarity(left, right) do
  dot_product =
    Enum.zip_reduce(left, right, 0.0, fn left_value, right_value, total ->
      total + left_value * right_value
    end)

  max(dot_product, 0.0)
end
# Returns a `MapSet` holding the pair key of every dismissed duplicate pair stored for
# the project.
defp dismissed_pair_keys(project_id) do
  DismissedDuplicatePair
  |> where([pair], pair.project_id == ^project_id)
  |> select([pair], {pair.post_id_a, pair.post_id_b})
  |> Repo.all()
  |> MapSet.new(fn {first_id, second_id} -> pair_key(first_id, second_id) end)
end
# Builds an order-independent key for two post ids: the sorted ids joined with "::".
defp pair_key(post_id_a, post_id_b) do
  {lower_id, higher_id} = sort_pair(post_id_a, post_id_b)
  Enum.join([lower_id, higher_id], "::")
end
# Returns the two ids as a tuple with the smaller one first.
defp sort_pair(post_id_a, post_id_b) do
  if post_id_a <= post_id_b do
    {post_id_a, post_id_b}
  else
    {post_id_b, post_id_a}
  end
end
end

View File

@@ -0,0 +1,24 @@
defmodule BDS.Embeddings.DismissedDuplicatePair do
  @moduledoc false

  # Ecto schema for the "dismissed_duplicate_pairs" table: one row per pair of posts
  # within a project that has been marked as "not a duplicate".

  use Ecto.Schema

  import Ecto.Changeset

  # Ids are caller-supplied strings; nothing is generated by the schema itself.
  @primary_key {:id, :string, autogenerate: false}
  @foreign_key_type :string

  schema "dismissed_duplicate_pairs" do
    belongs_to :project, BDS.Projects.Project, type: :string
    field :post_id_a, :string
    field :post_id_b, :string
    field :dismissed_at, :integer
  end

  # Every field is both castable and required.
  @permitted_fields ~w(id project_id post_id_a post_id_b dismissed_at)a

  # Builds a changeset that casts and requires all fields, and converts violations of
  # the project foreign key and of the `dismissed_pairs_idx` unique index into
  # changeset errors.
  def changeset(pair, attrs) do
    pair
    |> cast(attrs, @permitted_fields)
    |> validate_required(@permitted_fields)
    |> foreign_key_constraint(:project_id)
    |> unique_constraint(:post_id_a, name: :dismissed_pairs_idx)
  end
end

25
lib/bds/embeddings/key.ex Normal file
View File

@@ -0,0 +1,25 @@
defmodule BDS.Embeddings.Key do
  @moduledoc false

  # Ecto schema for the "embedding_keys" table: one stored embedding per post, holding
  # the hash of the text it was computed from and the vector as a string.

  use Ecto.Schema

  import Ecto.Changeset

  # The integer `label` is the primary key and must be supplied by the caller.
  @primary_key {:label, :integer, autogenerate: false}
  @foreign_key_type :string

  schema "embedding_keys" do
    belongs_to :post, BDS.Posts.Post, type: :string
    belongs_to :project, BDS.Projects.Project, type: :string
    field :content_hash, :string
    field :vector, :string
  end

  # Every field is both castable and required.
  @permitted_fields ~w(label post_id project_id content_hash vector)a

  # Builds a changeset that casts and requires all fields, and converts violations of
  # the post and project foreign keys into changeset errors.
  def changeset(key, attrs) do
    key
    |> cast(attrs, @permitted_fields)
    |> validate_required(@permitted_fields)
    |> foreign_key_constraint(:post_id)
    |> foreign_key_constraint(:project_id)
  end
end

146
lib/bds/post_links.ex Normal file
View File

@@ -0,0 +1,146 @@
defmodule BDS.PostLinks do
@moduledoc false
import Ecto.Query
alias BDS.Posts.Link
alias BDS.Posts.Post
alias BDS.Projects
alias BDS.Repo
# Rebuilds the outgoing link rows of a post from its current body.
#
# Links are extracted from the body, resolved to posts of the same project, and
# de-duplicated by target post and link text. Inside one transaction all existing
# links originating from the post are deleted and the resolved ones inserted, sharing a
# single `created_at` timestamp (seconds). Returns `:ok`; a failing insert raises.
def sync_post_links(%Post{} = post) do
  resolved_links =
    for candidate <- extract_links(post_body(post)),
        # Links that do not point at a known post resolve to `nil` and are dropped.
        resolved = resolve_post_link(post.project_id, candidate) do
      resolved
    end

  unique_links = Enum.uniq_by(resolved_links, &{&1.target_post_id, &1.link_text})

  Repo.transaction(fn ->
    Link
    |> where([link], link.source_post_id == ^post.id)
    |> Repo.delete_all()

    inserted_at = System.system_time(:second)

    for %{target_post_id: target_post_id, link_text: link_text} <- unique_links do
      attrs = %{
        id: Ecto.UUID.generate(),
        source_post_id: post.id,
        target_post_id: target_post_id,
        link_text: link_text,
        created_at: inserted_at
      }

      %Link{}
      |> Link.changeset(attrs)
      |> Repo.insert!()
    end
  end)

  :ok
end
# Deletes every link that starts at or points to the given post and returns `:ok`.
def delete_post_links(post_id) when is_binary(post_id) do
  Link
  |> where([link], link.source_post_id == ^post_id or link.target_post_id == ^post_id)
  |> Repo.delete_all()

  :ok
end
# Returns the links originating from the given post, oldest first.
def list_outgoing_links(post_id) when is_binary(post_id) do
  Link
  |> where([link], link.source_post_id == ^post_id)
  |> order_by([link], asc: link.created_at)
  |> Repo.all()
end
# Returns the links pointing at the given post, oldest first.
def list_incoming_links(post_id) when is_binary(post_id) do
  Link
  |> where([link], link.target_post_id == ^post_id)
  |> order_by([link], asc: link.created_at)
  |> Repo.all()
end
# Resolves the text body of a post for link extraction.
#
# Non-empty inline content wins. Empty content ("" as well as nil) is treated as absent
# so that the body is read from the post's file instead; otherwise a post whose content
# is an empty string would lose all of its links on the next sync even though its file
# still contains them.
#
# When reading from the file, everything after the first "\n---\n" separator is
# returned with trailing newlines removed (the whole file when there is no separator).
# A missing file path or an unreadable file yields "".
defp post_body(%Post{content: content}) when is_binary(content) and content != "", do: content

defp post_body(%Post{project_id: project_id, file_path: file_path}) do
  if file_path in [nil, ""] do
    ""
  else
    project = Projects.get_project!(project_id)
    full_path = Path.join(Projects.project_data_dir(project), file_path)

    case File.read(full_path) do
      {:ok, contents} ->
        case String.split(contents, "\n---\n", parts: 2) do
          [_frontmatter, body] -> String.trim_trailing(body, "\n")
          _parts -> contents
        end

      {:error, _reason} ->
        ""
    end
  end
end
# Extracts link candidates from a post body.
#
# Returns a list of `%{link_text: text_or_nil, href: href}` maps: first all Markdown
# inline links (`[text](destination)`), then all HTML anchors (`<a href="...">`), each
# group in document order. Link text is passed through `normalize_link_text/1`.
defp extract_links(body) when is_binary(body) do
  markdown_links =
    for [_full, link_text, destination] <- Regex.scan(~r/\[([^\]]+)\]\(([^)]+)\)/, body) do
      %{link_text: normalize_link_text(link_text), href: markdown_href(destination)}
    end

  html_links =
    for [_full, href, link_text] <-
          Regex.scan(~r/<a\s+[^>]*href=["']([^"']+)["'][^>]*>(.*?)<\/a>/is, body) do
      %{link_text: normalize_link_text(link_text), href: href}
    end

  markdown_links ++ html_links
end

# Reduces the parenthesized part of a Markdown link to its destination. Markdown allows
# an optional title after the destination (`[text](/path "Title")`); keeping it would
# make the title part of the URL path, so such links would never resolve to a post.
defp markdown_href(destination) do
  destination
  |> String.trim()
  |> String.split(~r/\s+/, parts: 2)
  |> hd()
end
# Resolves a link candidate to a post of the given project.
#
# Only the path component of the href is considered; the slug derived from it is looked
# up among the project's posts. Returns `%{target_post_id: id, link_text: text}` or
# `nil` when the href has no path, no slug can be derived, or no post matches.
# NOTE(review): the URI host is ignored, so a link to another site whose path ends in a
# local slug resolves to the local post — confirm this is intended.
defp resolve_post_link(project_id, %{href: href, link_text: link_text}) do
  %URI{path: path} = href |> to_string() |> String.trim() |> URI.parse()

  slug = if is_binary(path), do: extract_slug(path)
  target = if is_binary(slug), do: Repo.get_by(Post, project_id: project_id, slug: slug)

  case target do
    %Post{id: target_post_id} -> %{target_post_id: target_post_id, link_text: link_text}
    _ -> nil
  end
end
# Derives a post slug from a URL path, or returns `nil` when the path has no
# recognized shape. Recognized shapes (empty segments are ignored):
#
#   slug
#   language/slug
#   year/month/day/slug
#   language/year/month/day/slug
#
# where language is two lowercase letters, year four digits, month and day two digits.
defp extract_slug(path) do
  case String.split(path, "/", trim: true) do
    [slug] ->
      slug

    [language, slug] ->
      if language_code?(language), do: slug

    [year, month, day, slug] ->
      if date_segments?(year, month, day), do: slug

    [language, year, month, day, slug] ->
      if language_code?(language) and date_segments?(year, month, day), do: slug

    _other ->
      nil
  end
end

# True when the three segments look like a year, a month and a day.
defp date_segments?(year, month, day) do
  numeric_year?(year) and numeric_month_or_day?(month) and numeric_month_or_day?(day)
end

# Exactly four digits.
defp numeric_year?(value), do: value =~ ~r/^\d{4}$/

# Exactly two digits.
defp numeric_month_or_day?(value), do: value =~ ~r/^\d{2}$/

# Exactly two lowercase ASCII letters.
defp language_code?(value), do: value =~ ~r/^[a-z]{2}$/
# Strips HTML tags and surrounding whitespace from link text; returns `nil` when
# nothing is left.
defp normalize_link_text(value) do
  stripped =
    value
    |> to_string()
    |> String.replace(~r/<[^>]+>/, "")
    |> String.trim()

  if stripped == "", do: nil, else: stripped
end
end

View File

@@ -4,7 +4,9 @@ defmodule BDS.Posts do
import Ecto.Query import Ecto.Query
alias BDS.Frontmatter alias BDS.Frontmatter
alias BDS.Embeddings
alias BDS.Metadata alias BDS.Metadata
alias BDS.PostLinks
alias BDS.Posts.Post alias BDS.Posts.Post
alias BDS.Posts.Translation alias BDS.Posts.Translation
alias BDS.Projects alias BDS.Projects
@@ -47,6 +49,7 @@ defmodule BDS.Posts do
|> Repo.insert() |> Repo.insert()
|> case do |> case do
{:ok, post} -> {:ok, post} ->
:ok = Embeddings.sync_post(post)
:ok = Search.sync_post(post) :ok = Search.sync_post(post)
{:ok, post} {:ok, post}
@@ -75,6 +78,8 @@ defmodule BDS.Posts do
|> Repo.update() |> Repo.update()
|> case do |> case do
{:ok, updated_post} -> {:ok, updated_post} ->
:ok = Embeddings.sync_post(updated_post)
:ok = PostLinks.sync_post_links(updated_post)
:ok = Search.sync_post(updated_post) :ok = Search.sync_post(updated_post)
{:ok, updated_post} {:ok, updated_post}
@@ -119,7 +124,9 @@ defmodule BDS.Posts do
|> Repo.update() |> Repo.update()
|> case do |> case do
{:ok, updated_post} -> {:ok, updated_post} ->
:ok = Embeddings.sync_post(updated_post)
:ok = publish_post_translations(updated_post) :ok = publish_post_translations(updated_post)
:ok = PostLinks.sync_post_links(updated_post)
:ok = Search.sync_post(updated_post) :ok = Search.sync_post(updated_post)
{:ok, updated_post} {:ok, updated_post}
@@ -149,6 +156,8 @@ defmodule BDS.Posts do
%Post{} = post -> %Post{} = post ->
delete_post_file(post) delete_post_file(post)
:ok = Embeddings.remove_post(post.id)
:ok = PostLinks.delete_post_links(post.id)
Repo.delete!(post) Repo.delete!(post)
:ok = Search.delete_post(post.id) :ok = Search.delete_post(post.id)
{:ok, :deleted} {:ok, :deleted}

25
lib/bds/posts/link.ex Normal file
View File

@@ -0,0 +1,25 @@
defmodule BDS.Posts.Link do
  @moduledoc false

  # Ecto schema for the "post_links" table: a directed link from one post (the source)
  # to another (the target), with the link's text and a creation timestamp.

  use Ecto.Schema

  import Ecto.Changeset

  # Ids are caller-supplied strings; nothing is generated by the schema itself.
  @primary_key {:id, :string, autogenerate: false}
  @foreign_key_type :string

  schema "post_links" do
    belongs_to :source_post, BDS.Posts.Post,
      foreign_key: :source_post_id,
      references: :id,
      type: :string

    belongs_to :target_post, BDS.Posts.Post,
      foreign_key: :target_post_id,
      references: :id,
      type: :string

    field :link_text, :string
    field :created_at, :integer
  end

  # `link_text` may be cast but is optional; everything else is required.
  @castable_fields ~w(id source_post_id target_post_id link_text created_at)a
  @required_fields ~w(id source_post_id target_post_id created_at)a

  # Builds a changeset that casts the permitted fields, requires all of them except
  # `link_text`, and converts violations of the source and target post foreign keys
  # into changeset errors.
  def changeset(link, attrs) do
    link
    |> cast(attrs, @castable_fields)
    |> validate_required(@required_fields)
    |> foreign_key_constraint(:source_post_id)
    |> foreign_key_constraint(:target_post_id)
  end
end

View File

@@ -5,16 +5,17 @@ defmodule BDS.Rendering do
alias BDS.Frontmatter alias BDS.Frontmatter
alias BDS.Media.Media, as: MediaAsset alias BDS.Media.Media, as: MediaAsset
alias BDS.Rendering.FileSystem
alias BDS.Menu alias BDS.Menu
alias BDS.Metadata alias BDS.Metadata
alias BDS.PostLinks
alias BDS.Projects alias BDS.Projects
alias BDS.Rendering.Filters
alias BDS.I18n alias BDS.I18n
alias BDS.Rendering.FileSystem
alias BDS.Rendering.Filters
alias BDS.Repo alias BDS.Repo
alias BDS.Tags.Tag
alias BDS.Posts.Post alias BDS.Posts.Post
alias BDS.Posts.Translation alias BDS.Posts.Translation
alias BDS.Tags.Tag
alias BDS.Templates.Template alias BDS.Templates.Template
def render_post_page(project_id, template_slug, assigns) def render_post_page(project_id, template_slug, assigns)
@@ -119,10 +120,14 @@ defmodule BDS.Rendering do
main_language = metadata.main_language || language main_language = metadata.main_language || language
post_record = load_post_record(assigns) post_record = load_post_record(assigns)
canonical_post = canonical_post_record(post_record)
post_id = canonical_post_id(post_record, assigns)
post_categories = Map.get(post_record || %{}, :categories, []) || [] post_categories = Map.get(post_record || %{}, :categories, []) || []
post_tags = Map.get(post_record || %{}, :tags, []) || [] post_tags = Map.get(post_record || %{}, :tags, []) || []
canonical_post_paths = canonical_post_path_by_slug(project_id, main_language) canonical_post_paths = canonical_post_path_by_slug(project_id, main_language)
canonical_media_paths = canonical_media_path_by_source_path(project_id) canonical_media_paths = canonical_media_path_by_source_path(project_id)
incoming_links = link_contexts(project_id, post_id, :incoming, main_language)
outgoing_links = link_contexts(project_id, post_id, :outgoing, main_language)
%{ %{
language: language, language: language,
@@ -141,18 +146,18 @@ defmodule BDS.Rendering do
pico_stylesheet_href: default_pico_stylesheet_href(), pico_stylesheet_href: default_pico_stylesheet_href(),
html_theme_attribute: html_theme_attribute(metadata.pico_theme), html_theme_attribute: html_theme_attribute(metadata.pico_theme),
blog_languages: blog_languages(metadata, language), blog_languages: blog_languages(metadata, language),
alternate_links: [], alternate_links: alternate_links(canonical_post, project_id, main_language),
menu_items: menu_items(project_id), menu_items: menu_items(project_id),
calendar_initial_year: calendar_initial_year(post_record), calendar_initial_year: calendar_initial_year(post_record),
calendar_initial_month: calendar_initial_month(post_record), calendar_initial_month: calendar_initial_month(post_record),
post_categories: post_categories, post_categories: post_categories,
post_tags: post_tags, post_tags: post_tags,
tag_color_by_name: tag_color_by_name(project_id), tag_color_by_name: tag_color_by_name(project_id),
backlinks: [], backlinks: backlinks(incoming_links),
canonical_post_path_by_slug: canonical_post_paths, canonical_post_path_by_slug: canonical_post_paths,
canonical_media_path_by_source_path: canonical_media_paths, canonical_media_path_by_source_path: canonical_media_paths,
post_data_json_by_id: post_data_json(assigns, post_record), post_data_json_by_id: post_data_json(assigns, post_record),
post: build_post_context(assigns, post_record) post: build_post_context(assigns, post_record, incoming_links, outgoing_links)
} }
end end
@@ -171,6 +176,10 @@ defmodule BDS.Rendering do
canonical_post_paths = canonical_post_path_by_slug(project_id, main_language) canonical_post_paths = canonical_post_path_by_slug(project_id, main_language)
canonical_media_paths = canonical_media_path_by_source_path(project_id) canonical_media_paths = canonical_media_path_by_source_path(project_id)
day_blocks = build_day_blocks(posts)
min_date = min_date(posts)
max_date = max_date(posts)
normalized_archive_context = normalize_archive_context(archive_context)
%{ %{
language: language, language: language,
@@ -189,10 +198,10 @@ defmodule BDS.Rendering do
menu_items: menu_items(project_id), menu_items: menu_items(project_id),
calendar_initial_year: calendar_initial_year_from_posts(posts), calendar_initial_year: calendar_initial_year_from_posts(posts),
calendar_initial_month: calendar_initial_month_from_posts(posts), calendar_initial_month: calendar_initial_month_from_posts(posts),
archive_context: normalize_archive_context(archive_context), archive_context: normalized_archive_context,
show_archive_range_heading: false, show_archive_range_heading: show_archive_range_heading?(normalized_archive_context, day_blocks),
min_date: nil, min_date: min_date,
max_date: nil, max_date: max_date,
is_list_page: true, is_list_page: true,
is_first_page: pagination.current_page <= 1, is_first_page: pagination.current_page <= 1,
is_last_page: pagination.current_page >= pagination.total_pages, is_last_page: pagination.current_page >= pagination.total_pages,
@@ -208,14 +217,7 @@ defmodule BDS.Rendering do
canonical_media_path_by_source_path: canonical_media_paths, canonical_media_path_by_source_path: canonical_media_paths,
post_data_json_by_id: post_data_json_by_id:
Enum.into(posts, %{}, fn post -> {post.id, post_data_json_value(post)} end), Enum.into(posts, %{}, fn post -> {post.id, post_data_json_value(post)} end),
day_blocks: [ day_blocks: day_blocks
%{
date_label: "",
show_date_marker: false,
show_separator: false,
posts: posts
}
]
} }
end end
@@ -332,12 +334,23 @@ defmodule BDS.Rendering do
end end
end end
defp canonical_post_record(%Translation{translation_for: post_id}), do: Repo.get(Post, post_id)
defp canonical_post_record(%Post{} = post), do: post
defp canonical_post_record(_other), do: nil
defp canonical_post_id(%Translation{translation_for: post_id}, _assigns), do: post_id
defp canonical_post_id(%Post{id: post_id}, _assigns), do: post_id
defp canonical_post_id(_post_record, assigns), do: Map.get(assigns, :id, Map.get(assigns, "id"))
defp post_data_json(assigns, post_record) do defp post_data_json(assigns, post_record) do
id = Map.get(assigns, :id, Map.get(assigns, "id")) id = Map.get(assigns, :id, Map.get(assigns, "id"))
if is_binary(id) do if is_binary(id) do
incoming_links = link_contexts(Map.get(post_record || %{}, :project_id), canonical_post_id(post_record, assigns), :incoming, Map.get(post_record || %{}, :language))
outgoing_links = link_contexts(Map.get(post_record || %{}, :project_id), canonical_post_id(post_record, assigns), :outgoing, Map.get(post_record || %{}, :language))
%{ %{
id => post_data_json_value(build_post_context(assigns, post_record)) id => post_data_json_value(build_post_context(assigns, post_record, incoming_links, outgoing_links))
} }
else else
%{} %{}
@@ -387,6 +400,58 @@ defmodule BDS.Rendering do
end) end)
end end
defp alternate_links(nil, _project_id, _main_language), do: []
defp alternate_links(%Post{} = post, project_id, main_language) do
translations =
Repo.all(
from translation in Translation,
where:
translation.project_id == ^project_id and
translation.translation_for == ^post.id and
translation.status == :published,
order_by: [asc: translation.language]
)
[%{href: post_path(post, nil), hreflang: normalize_language(post.language, main_language)}] ++
Enum.map(translations, fn translation ->
%{href: post_path(post, translation.language, main_language), hreflang: translation.language}
end)
end
defp backlinks(incoming_links) do
Enum.map(incoming_links, fn link ->
%{path: link.href, display_slug: link.display_slug, title: link.title}
end)
end
defp link_contexts(_project_id, nil, _direction, _main_language), do: []
defp link_contexts(project_id, post_id, :incoming, main_language) do
PostLinks.list_incoming_links(post_id)
|> Enum.map(&link_context(project_id, &1, :incoming, main_language))
|> Enum.reject(&is_nil/1)
end
defp link_contexts(project_id, post_id, :outgoing, main_language) do
PostLinks.list_outgoing_links(post_id)
|> Enum.map(&link_context(project_id, &1, :outgoing, main_language))
|> Enum.reject(&is_nil/1)
end
defp link_context(_project_id, link, direction, main_language) do
linked_post_id =
case direction do
:incoming -> link.source_post_id
:outgoing -> link.target_post_id
end
case Repo.get(Post, linked_post_id) do
nil -> nil
linked_post -> %{href: post_path(linked_post, nil), title: linked_post.title, display_slug: linked_post.slug, language: normalize_language(linked_post.language, main_language)}
end
end
defp canonical_media_path_by_source_path(project_id) do defp canonical_media_path_by_source_path(project_id) do
Repo.all(from media in MediaAsset, where: media.project_id == ^project_id) Repo.all(from media in MediaAsset, where: media.project_id == ^project_id)
|> Enum.reduce(%{}, fn media, acc -> |> Enum.reduce(%{}, fn media, acc ->
@@ -407,7 +472,7 @@ defmodule BDS.Rendering do
defp post_path(post, language_prefix) defp post_path(post, language_prefix)
when is_binary(language_prefix) and language_prefix != "" do when is_binary(language_prefix) and language_prefix != "" do
Path.join([String.trim_leading(language_prefix, "/"), post_path(post, nil)]) language_prefix <> post_path(post, nil)
end end
defp post_path(post, nil) do defp post_path(post, nil) do
@@ -444,20 +509,26 @@ defmodule BDS.Rendering do
), ),
excerpt: excerpt:
Map.get(post, :excerpt, Map.get(post, "excerpt", Map.get(post_record || %{}, :excerpt))), Map.get(post, :excerpt, Map.get(post, "excerpt", Map.get(post_record || %{}, :excerpt))),
author: Map.get(post_record || %{}, :author), author: Map.get(post, :author, Map.get(post, "author", Map.get(post_record || %{}, :author))),
language: language:
Map.get( Map.get(
post, post,
:language, :language,
Map.get(post, "language", Map.get(post_record || %{}, :language)) Map.get(post, "language", Map.get(post_record || %{}, :language))
), ),
published_at: Map.get(post_record || %{}, :published_at), published_at:
created_at: Map.get(post_record || %{}, :created_at), Map.get(post, :published_at, Map.get(post, "published_at", Map.get(post_record || %{}, :published_at))),
updated_at: Map.get(post_record || %{}, :updated_at), created_at:
tags: Map.get(post_record || %{}, :tags, []) || [], Map.get(post, :created_at, Map.get(post, "created_at", Map.get(post_record || %{}, :created_at))),
categories: Map.get(post_record || %{}, :categories, []) || [], updated_at:
template_slug: Map.get(post_record || %{}, :template_slug), Map.get(post, :updated_at, Map.get(post, "updated_at", Map.get(post_record || %{}, :updated_at))),
do_not_translate: Map.get(post_record || %{}, :do_not_translate, false), tags: Map.get(post, :tags, Map.get(post, "tags", Map.get(post_record || %{}, :tags, []))) || [],
categories:
Map.get(post, :categories, Map.get(post, "categories", Map.get(post_record || %{}, :categories, []))) || [],
template_slug:
Map.get(post, :template_slug, Map.get(post, "template_slug", Map.get(post_record || %{}, :template_slug))),
do_not_translate:
Map.get(post, :do_not_translate, Map.get(post, "do_not_translate", Map.get(post_record || %{}, :do_not_translate, false))),
href: Map.get(post, :href, Map.get(post, "href")), href: Map.get(post, :href, Map.get(post, "href")),
show_title: true, show_title: true,
linked_media: [], linked_media: [],
@@ -467,7 +538,7 @@ defmodule BDS.Rendering do
end) end)
end end
defp build_post_context(assigns, post_record) do defp build_post_context(assigns, post_record, incoming_links, outgoing_links) do
%{ %{
id: Map.get(assigns, :id, Map.get(assigns, "id")), id: Map.get(assigns, :id, Map.get(assigns, "id")),
slug: Map.get(assigns, :slug, Map.get(assigns, "slug")), slug: Map.get(assigns, :slug, Map.get(assigns, "slug")),
@@ -500,8 +571,8 @@ defmodule BDS.Rendering do
), ),
do_not_translate: Map.get(post_record || %{}, :do_not_translate, false), do_not_translate: Map.get(post_record || %{}, :do_not_translate, false),
linked_media: [], linked_media: [],
outgoing_links: [], outgoing_links: outgoing_links,
incoming_links: [] incoming_links: incoming_links
} }
end end
@@ -544,7 +615,56 @@ defmodule BDS.Rendering do
end end
defp normalize_archive_context(nil), do: nil defp normalize_archive_context(nil), do: nil
defp normalize_archive_context(%{} = archive_context), do: archive_context
defp normalize_archive_context(%{} = archive_context) do
%{
kind: Map.get(archive_context, :kind, Map.get(archive_context, "kind")),
name: Map.get(archive_context, :name, Map.get(archive_context, "name")),
month: Map.get(archive_context, :month, Map.get(archive_context, "month")),
year: Map.get(archive_context, :year, Map.get(archive_context, "year")),
day: Map.get(archive_context, :day, Map.get(archive_context, "day"))
}
end
defp build_day_blocks(posts) do
grouped_blocks =
posts
|> Enum.filter(&is_integer(Map.get(&1, :created_at)))
|> Enum.group_by(&DateTime.from_unix!(Map.get(&1, :created_at)) |> DateTime.to_date() |> Date.to_iso8601())
|> Enum.sort_by(fn {label, _posts} -> label end)
grouped_blocks
|> Enum.with_index()
|> Enum.map(fn {{date_label, grouped_posts}, index} ->
%{
date_label: date_label,
show_date_marker: true,
show_separator: index < length(grouped_blocks) - 1,
posts: Enum.sort_by(grouped_posts, &Map.get(&1, :created_at))
}
end)
|> case do
[] -> [%{date_label: "", show_date_marker: false, show_separator: false, posts: posts}]
blocks -> blocks
end
end
defp min_date(posts) do
posts
|> Enum.map(&Map.get(&1, :created_at))
|> Enum.filter(&is_integer/1)
|> Enum.min(fn -> nil end)
end
defp max_date(posts) do
posts
|> Enum.map(&Map.get(&1, :created_at))
|> Enum.filter(&is_integer/1)
|> Enum.max(fn -> nil end)
end
defp show_archive_range_heading?(%{kind: "date"}, _day_blocks), do: true
defp show_archive_range_heading?(_archive_context, _day_blocks), do: false
defp html_theme_attribute(nil), do: nil defp html_theme_attribute(nil), do: nil
defp html_theme_attribute(""), do: nil defp html_theme_attribute(""), do: nil
@@ -574,4 +694,15 @@ defmodule BDS.Rendering do
defp language_prefix(language, main_language) when language == main_language, do: "" defp language_prefix(language, main_language) when language == main_language, do: ""
defp language_prefix(nil, _main_language), do: "" defp language_prefix(nil, _main_language), do: ""
defp language_prefix(language, _main_language), do: "/#{language}" defp language_prefix(language, _main_language), do: "/#{language}"
defp normalize_language(nil, fallback), do: fallback
defp normalize_language("", fallback), do: fallback
defp normalize_language(language, _fallback) do
language
|> to_string()
|> String.downcase()
|> String.split("-", parts: 2)
|> hd()
end
end end

View File

@@ -0,0 +1,106 @@
defmodule BDS.EmbeddingsTest do
  # Integration tests for BDS.Embeddings. Each test runs against a sandboxed
  # repo connection and a project rooted in a throwaway temp directory.
  use ExUnit.Case, async: false

  setup do
    :ok = Ecto.Adapters.SQL.Sandbox.checkout(BDS.Repo)

    temp_dir =
      Path.join(System.tmp_dir!(), "bds-embeddings-#{System.unique_integer([:positive])}")

    File.mkdir_p!(temp_dir)
    on_exit(fn -> File.rm_rf(temp_dir) end)

    {:ok, project} = BDS.Projects.create_project(%{name: "Embeddings", data_path: temp_dir})
    %{project: project}
  end

  test "embeddings index published posts when semantic similarity is enabled and support similarity, duplicates, dismissals, and tag suggestions",
       %{project: project} do
    assert {:ok, _metadata} =
             BDS.Metadata.update_project_metadata(project.id, %{semantic_similarity_enabled: true})

    # alpha and beta share most of their vocabulary; gamma is unrelated.
    assert {:ok, alpha} =
             BDS.Posts.create_post(%{
               project_id: project.id,
               title: "Space Travel",
               content: "space rocket launch orbit mission galaxy",
               tags: ["space", "science"],
               language: "en"
             })

    assert {:ok, beta} =
             BDS.Posts.create_post(%{
               project_id: project.id,
               title: "Rocket Mission",
               content: "rocket launch mission orbit space station",
               tags: ["space", "mission"],
               language: "en"
             })

    assert {:ok, gamma} =
             BDS.Posts.create_post(%{
               project_id: project.id,
               title: "Bread Baking",
               content: "flour yeast dough oven loaf kitchen",
               tags: ["food"],
               language: "en"
             })

    assert {:ok, alpha} = BDS.Posts.publish_post(alpha.id)
    assert {:ok, beta} = BDS.Posts.publish_post(beta.id)
    assert {:ok, gamma} = BDS.Posts.publish_post(gamma.id)

    assert {:ok, indexed} = BDS.Embeddings.index_unindexed(project.id)
    assert Enum.sort(indexed) == Enum.sort([alpha.id, beta.id, gamma.id])

    assert {:ok, similar} = BDS.Embeddings.find_similar(alpha.id, 2)
    assert length(similar) == 2
    assert hd(similar).post_id == beta.id
    assert hd(similar).score > List.last(similar).score

    assert {:ok, scores} = BDS.Embeddings.compute_similarities(alpha.id, [beta.id, gamma.id])
    assert scores[beta.id] > scores[gamma.id]

    assert {:ok, suggestions} = BDS.Embeddings.suggest_tags(alpha.id, "rocket orbit mission")
    assert "space" in suggestions

    assert {:ok, duplicates} = BDS.Embeddings.find_duplicates(project.id)

    assert Enum.any?(duplicates, fn pair ->
             MapSet.new([pair.post_id_a, pair.post_id_b]) == MapSet.new([alpha.id, beta.id])
           end)

    # A dismissed pair must no longer be reported as a duplicate.
    assert {:ok, dismissal} = BDS.Embeddings.dismiss_duplicate_pair(alpha.id, beta.id)
    assert dismissal.project_id == project.id

    assert {:ok, filtered_duplicates} = BDS.Embeddings.find_duplicates(project.id)

    refute Enum.any?(filtered_duplicates, fn pair ->
             MapSet.new([pair.post_id_a, pair.post_id_b]) == MapSet.new([alpha.id, beta.id])
           end)

    # Rewriting alpha with gamma's vocabulary and republishing must flip the ranking.
    assert {:ok, alpha} = BDS.Posts.update_post(alpha.id, %{content: "kitchen flour dough loaf"})
    assert {:ok, alpha} = BDS.Posts.publish_post(alpha.id)

    assert {:ok, updated_scores} =
             BDS.Embeddings.compute_similarities(alpha.id, [beta.id, gamma.id])

    assert updated_scores[gamma.id] > updated_scores[beta.id]

    # A deleted post must drop out of similarity results.
    assert {:ok, :deleted} = BDS.Posts.delete_post(gamma.id)

    assert {:ok, after_delete} =
             BDS.Embeddings.compute_similarities(alpha.id, [beta.id, gamma.id])

    refute Map.has_key?(after_delete, gamma.id)
  end

  test "embedding queries are gated off when semantic similarity is disabled", %{project: project} do
    assert {:ok, post} =
             BDS.Posts.create_post(%{
               project_id: project.id,
               title: "Disabled",
               content: "space rocket mission"
             })

    assert {:ok, post} = BDS.Posts.publish_post(post.id)

    assert {:ok, []} = BDS.Embeddings.find_similar(post.id, 5)
    assert {:ok, []} = BDS.Embeddings.find_duplicates(project.id)

    # The pattern `%{}` matches ANY map, so compare by value to prove the
    # result is actually empty.
    assert {:ok, similarities} = BDS.Embeddings.compute_similarities(post.id, [post.id])
    assert similarities == %{}
  end
end

View File

@@ -0,0 +1,98 @@
defmodule BDS.PostLinksTest do
  # Integration tests for the `post_links` table: rows are expected to track
  # the markdown links between posts across publish, update, and delete.
  use ExUnit.Case, async: false

  alias BDS.Repo

  setup do
    :ok = Ecto.Adapters.SQL.Sandbox.checkout(BDS.Repo)

    temp_dir =
      Path.join(System.tmp_dir!(), "bds-post-links-#{System.unique_integer([:positive])}")

    File.mkdir_p!(temp_dir)
    on_exit(fn -> File.rm_rf(temp_dir) end)

    {:ok, project} = BDS.Projects.create_project(%{name: "Links", data_path: temp_dir})
    %{project: project, temp_dir: temp_dir}
  end

  test "publishing and updating posts sync outgoing post links and deleting a post removes them", %{
    project: project
  } do
    assert {:ok, target} =
             BDS.Posts.create_post(%{
               project_id: project.id,
               title: "Target Post",
               content: "target body"
             })

    assert {:ok, target} = BDS.Posts.publish_post(target.id)
    target_href = canonical_post_href(target)

    assert {:ok, source} =
             BDS.Posts.create_post(%{
               project_id: project.id,
               title: "Source Post",
               content: "See [Target](#{target_href})"
             })

    assert {:ok, source} = BDS.Posts.publish_post(source.id)

    assert post_links() == [
             %{
               source_post_id: source.id,
               target_post_id: target.id,
               link_text: "Target"
             }
           ]

    # Updating the body returns the post to draft and drops its links.
    assert {:ok, source} =
             BDS.Posts.update_post(source.id, %{
               content: "A revised body without a post reference"
             })

    assert source.status == :draft
    assert post_links() == []

    assert {:ok, source} =
             BDS.Posts.update_post(source.id, %{
               content: "Now [Target Again](#{target_href})"
             })

    assert {:ok, source} = BDS.Posts.publish_post(source.id)

    assert post_links() == [
             %{
               source_post_id: source.id,
               target_post_id: target.id,
               link_text: "Target Again"
             }
           ]

    # Deleting the link target removes the rows that point at it.
    assert {:ok, :deleted} = BDS.Posts.delete_post(target.id)
    assert post_links() == []
  end

  # Builds the "/yyyy/mm/dd/slug/" href for a post from its `created_at`
  # Unix timestamp (UTC) and slug.
  #
  # The path is interpolated rather than built with `Path.join/1`, which drops
  # empty segments and trailing separators and so would return the relative
  # "yyyy/mm/dd/slug" without the surrounding slashes.
  defp canonical_post_href(post) do
    datetime = DateTime.from_unix!(post.created_at)
    month = String.pad_leading(Integer.to_string(datetime.month), 2, "0")
    day = String.pad_leading(Integer.to_string(datetime.day), 2, "0")

    "/#{datetime.year}/#{month}/#{day}/#{post.slug}/"
  end

  # Reads every `post_links` row as a map, ordered by source then target id.
  defp post_links do
    Repo.query!(
      "SELECT source_post_id, target_post_id, link_text FROM post_links ORDER BY source_post_id, target_post_id",
      []
    ).rows
    |> Enum.map(fn [source_post_id, target_post_id, link_text] ->
      %{source_post_id: source_post_id, target_post_id: target_post_id, link_text: link_text}
    end)
  end
end

View File

@@ -72,6 +72,82 @@ defmodule BDS.RenderingTest do
assert rendered =~ "|2|1|#{published_template.slug}|true" assert rendered =~ "|2|1|#{published_template.slug}|true"
end end
test "render_post_page exposes alternate links, backlinks, and post link context", %{
  project: project
} do
  # Two blog languages, so the translated target gets an alternate link.
  language_settings = %{main_language: "en", blog_languages: ["en", "de"]}
  assert {:ok, _metadata} = BDS.Metadata.update_project_metadata(project.id, language_settings)

  # The template prints every link collection in a compact, assertable form.
  template_attrs = %{
    project_id: project.id,
    title: "Render Link Context",
    kind: :post,
    content:
      "alts={% for alt in alternate_links %}[{{ alt.hreflang }}={{ alt.href }}]{% endfor %}|backlinks={% for backlink in backlinks %}[{{ backlink.display_slug }}={{ backlink.title }}={{ backlink.path }}]{% endfor %}|outgoing={% for link in post.outgoing_links %}[{{ link.display_slug }}={{ link.title }}={{ link.href }}]{% endfor %}|incoming={% for link in post.incoming_links %}[{{ link.display_slug }}={{ link.title }}={{ link.href }}]{% endfor %}"
  }

  assert {:ok, template} = BDS.Templates.create_template(template_attrs)
  assert {:ok, published_template} = BDS.Templates.publish_template(template.id)
  template_slug = published_template.slug

  assert {:ok, target} =
           BDS.Posts.create_post(%{
             project_id: project.id,
             title: "Linked Target",
             content: "target body",
             language: "en",
             template_slug: template_slug
           })

  german_translation = %{title: "Verlinktes Ziel", content: "zieltext"}

  assert {:ok, _translation} =
           BDS.Posts.upsert_post_translation(target.id, "de", german_translation)

  assert {:ok, target} = BDS.Posts.publish_post(target.id)
  target_href = canonical_post_href(target)

  assert {:ok, source} =
           BDS.Posts.create_post(%{
             project_id: project.id,
             title: "Linking Source",
             content: "See [Target Link](#{target_href})",
             language: "en",
             template_slug: template_slug
           })

  assert {:ok, source} = BDS.Posts.publish_post(source.id)
  source_href = canonical_post_href(source)

  # Assigns handed to the renderer for a given post.
  page_assigns = fn post ->
    %{
      id: post.id,
      title: post.title,
      content: post.content || "",
      slug: post.slug,
      language: "en",
      template_slug: template_slug
    }
  end

  assert {:ok, rendered_target} =
           Rendering.render_post_page(project.id, template_slug, page_assigns.(target))

  assert rendered_target =~ "alts=[en=#{target_href}]"
  assert rendered_target =~ "[de=/de#{target_href}]"
  assert rendered_target =~ "backlinks=[linking-source=Linking Source=#{source_href}]"
  assert rendered_target =~ "incoming=[linking-source=Linking Source=#{source_href}]"

  assert {:ok, rendered_source} =
           Rendering.render_post_page(project.id, template_slug, page_assigns.(source))

  assert rendered_source =~ "outgoing=[linked-target=Linked Target=#{target_href}]"
end
test "render_list_page exposes pagination and render_not_found_page localizes default copy", %{ test "render_list_page exposes pagination and render_not_found_page localizes default copy", %{
project: project project: project
} do } do
@@ -149,4 +225,95 @@ defmodule BDS.RenderingTest do
assert published_list_template.kind == :list assert published_list_template.kind == :list
end end
test "render_list_page groups posts into day blocks and exposes archive range fields", %{
  project: project
} do
  language_settings = %{main_language: "en", blog_languages: ["en"]}
  assert {:ok, _metadata} = BDS.Metadata.update_project_metadata(project.id, language_settings)

  # The template prints the range, heading flag, day blocks and archive context.
  template_attrs = %{
    project_id: project.id,
    title: "Render Day Blocks",
    kind: :list,
    content:
      "range={{ min_date }}-{{ max_date }}|heading={{ show_archive_range_heading }}|blocks={% for block in day_blocks %}[{{ block.date_label }}:{{ block.posts.size }}:{{ block.show_date_marker }}]{% endfor %}|archive={{ archive_context.kind }}:{{ archive_context.year }}:{{ archive_context.month }}"
  }

  assert {:ok, list_template} = BDS.Templates.create_template(template_attrs)
  assert {:ok, published_list_template} = BDS.Templates.publish_template(list_template.id)

  # Disable every other list template of the project.
  other_list_templates =
    from(template in BDS.Templates.Template,
      where:
        template.project_id == ^project.id and template.kind == :list and
          template.id != ^published_list_template.id
    )

  BDS.Repo.update_all(other_list_templates, set: [enabled: false])

  first_day = 1_711_843_200
  second_day = first_day + 86_400

  # Builds a list-page post map; the slug doubles as the id.
  build_post = fn slug, title, excerpt, timestamp, href ->
    %{
      id: slug,
      slug: slug,
      title: title,
      excerpt: excerpt,
      language: "en",
      created_at: timestamp,
      updated_at: timestamp,
      published_at: timestamp,
      tags: [],
      categories: [],
      href: href
    }
  end

  posts = [
    build_post.("first", "First", "one", first_day, "/2024/03/31/first/"),
    build_post.("second", "Second", "two", second_day, "/2024/04/01/second/")
  ]

  single_page = %{
    current_page: 1,
    total_pages: 1,
    total_items: 2,
    items_per_page: 10,
    has_prev_page: false,
    prev_page_href: nil,
    has_next_page: false,
    next_page_href: nil
  }

  list_assigns = %{
    language: "en",
    page_title: "Archive",
    posts: posts,
    archive_context: %{kind: "date", year: 2024, month: 4},
    pagination: single_page
  }

  assert {:ok, rendered} = Rendering.render_list_page(project.id, list_assigns)

  assert rendered =~ "heading=true"
  assert rendered =~ "blocks=[2024-03-31:1:true][2024-04-01:1:true]"
  assert rendered =~ "archive=date:2024:4"
  assert rendered =~ "range=1711843200-1711929600"
end
# Builds the "/yyyy/mm/dd/slug/" href for a post from its `created_at`
# Unix timestamp (UTC) and slug; month and day are zero-padded to two digits.
defp canonical_post_href(post) do
  %DateTime{year: year, month: month, day: day} = DateTime.from_unix!(post.created_at)
  two_digits = fn number -> String.pad_leading(Integer.to_string(number), 2, "0") end

  "/#{year}/#{two_digits.(month)}/#{two_digits.(day)}/#{post.slug}/"
end
end end