fix: A1-12 functional client-side search with real PagefindUI and fragment index

This commit is contained in:
2026-05-29 10:29:42 +02:00
parent 5b619f492a
commit babae1838d
23 changed files with 1038 additions and 485 deletions

View File

@@ -7,10 +7,25 @@ defmodule BDS.Generation.Pagefind do
@typedoc "A (relative_path, content) generated file tuple."
@type generated_file :: {String.t(), String.t()}
# Location of the bundled PagefindUI assets inside the :bds application
# directory. Module attributes are evaluated while the module is compiled,
# so this path is resolved at compile time.
@assets_dir Application.app_dir(:bds, "priv/preview_assets/assets")
@ui_js_path Path.join(@assets_dir, "pagefind-ui.js")
@ui_css_path Path.join(@assets_dir, "pagefind-ui.css")
# Declare both asset files as external resources of this module so the
# compiler knows the module depends on them.
@external_resource @ui_js_path
@external_resource @ui_css_path
# The asset contents are read once, at compile time, and embedded in the
# module; File.read!/1 raises (failing compilation) if a file is missing.
@ui_js File.read!(@ui_js_path)
@ui_css File.read!(@ui_css_path)
@doc """
Build the per-language Pagefind index outputs (`pagefind/index.json`,
`pagefind/pagefind-ui.js`, `pagefind/pagefind-ui.css`) for every blog
language declared on the plan.
The fragment index records one entry per indexable page, where indexable
means the page carries a `data-pagefind-body` region. Each entry stores the
page URL, its title, and the body text scoped to that region — mirroring
Pagefind's behaviour of ignoring content outside `data-pagefind-body`.
"""
@spec build_outputs(map(), [html_output()]) :: [generated_file()]
def build_outputs(plan, html_outputs) do
@@ -31,8 +46,8 @@ defmodule BDS.Generation.Pagefind do
[
{Path.join(prefix ++ ["index.json"]),
Jason.encode!(%{"language" => language, "pages" => pages})},
{Path.join(prefix ++ ["pagefind-ui.js"]), ui_js(language)},
{Path.join(prefix ++ ["pagefind-ui.css"]), ui_css()}
{Path.join(prefix ++ ["pagefind-ui.js"]), @ui_js},
{Path.join(prefix ++ ["pagefind-ui.css"]), @ui_css}
]
end)
end
@@ -43,11 +58,14 @@ defmodule BDS.Generation.Pagefind do
String.ends_with?(relative_path, ".html") and
language_match?(relative_path, route_language, other_prefixes)
end)
|> Enum.map(fn {relative_path, content} ->
%{
"url" => "/" <> relative_path,
"text" => text(content)
}
|> Enum.flat_map(fn {relative_path, content} ->
case body_text(content) do
nil ->
[]
text ->
[%{"url" => "/" <> relative_path, "title" => title(content), "text" => text}]
end
end)
end
@@ -60,19 +78,94 @@ defmodule BDS.Generation.Pagefind do
# Clause for a concrete route language: a path belongs to that language when
# it lives under the "<route_language>/" directory prefix. The list of other
# language prefixes is not needed for this check.
defp language_match?(relative_path, route_language, _other_prefixes) do
  language_dir = route_language <> "/"
  String.starts_with?(relative_path, language_dir)
end
defp text(content) do
content
# Returns the plain text found inside the first element carrying the
# `data-pagefind-body` attribute, or nil when no element in `content` is
# marked. Callers use the nil to leave unmarked pages out of the index
# (matching Pagefind semantics).
defp body_text(content) do
  marker = ~r/<([a-zA-Z0-9]+)[^>]*\bdata-pagefind-body\b[^>]*>/

  # With `return: :index` the match is reported as byte offsets: first the
  # whole opening tag, then the captured tag name.
  with [{open_start, open_len}, {name_start, name_len}] <-
         Regex.run(marker, content, return: :index) do
    tag_name = binary_part(content, name_start, name_len)

    content
    |> scoped_region(tag_name, open_start + open_len)
    |> plain_text()
  else
    _no_match -> nil
  end
end
# Returns the inner HTML of the marked element. Starting right after its
# opening tag (`body_start`), every later open/close of the same tag name is
# located and the close that balances the marked element is looked up; the
# region is everything before that close. When no balancing close exists the
# remainder of the document is returned.
defp scoped_region(content, tag, body_start) do
  tail = binary_part(content, body_start, byte_size(content) - body_start)

  # Byte offsets (relative to `tail`) of every case-insensitive match.
  offsets_of = fn pattern ->
    pattern
    |> Regex.compile!("i")
    |> Regex.scan(tail, return: :index)
    |> Enum.map(fn [{offset, _length}] -> offset end)
  end

  offsets = offsets_of.("<#{tag}\\b") ++ offsets_of.("</#{tag}\\s*>")

  # Label each offset as :open/:close and put them in document order.
  timeline =
    offsets
    |> Enum.map(&{&1, event_kind(tail, &1, tag)})
    |> Enum.sort_by(fn {offset, _kind} -> offset end)

  case balanced_close(timeline, 0) do
    nil -> tail
    offset -> binary_part(tail, 0, offset)
  end
end
# Classifies the tag starting at byte offset `pos` of `rest`: `:close` when
# the text there begins with "</", `:open` otherwise. Only the first two
# bytes decide the answer, so the tag name itself is not consulted.
defp event_kind(rest, pos, _tag) do
  remaining = byte_size(rest) - pos

  case binary_part(rest, pos, min(2, remaining)) do
    "</" -> :close
    _other -> :open
  end
end
# Walks the ordered `{position, :open | :close}` events while tracking the
# nesting depth. Returns the position of the first :close seen at depth 0
# (the close that balances the enclosing element), or nil when the events
# run out before such a close is found.
defp balanced_close(events, depth) do
  outcome =
    Enum.reduce_while(events, depth, fn
      {pos, :close}, 0 -> {:halt, {:found, pos}}
      {_pos, :close}, level -> {:cont, level - 1}
      {_pos, :open}, level -> {:cont, level + 1}
    end)

  case outcome do
    {:found, pos} -> pos
    _unbalanced -> nil
  end
end
# Picks the page title: the text of the first <title> element, falling back
# to the first <h1>, and finally to the empty string when neither yields any
# non-blank text.
defp title(content) do
  candidates = [
    ~r/<title[^>]*>(.*?)<\/title>/si,
    ~r/<h1[^>]*>(.*?)<\/h1>/si
  ]

  Enum.find_value(candidates, "", fn pattern -> tag_text(content, pattern) end)
end
# Runs `regex` (expected to have exactly one capture group) against
# `content` and returns the captured markup as plain text. Returns nil when
# the regex does not match or the resulting text is blank.
defp tag_text(content, regex) do
  with [_whole, inner] <- Regex.run(regex, content) do
    inner
    |> plain_text()
    |> nil_if_blank()
  else
    _no_match -> nil
  end
end
# Maps the empty string to nil and returns any other value unchanged.
defp nil_if_blank(value) do
  if value == "", do: nil, else: value
end
# Reduces an HTML fragment to searchable plain text: markup is removed,
# the supported character entities are decoded, and every run of whitespace
# is collapsed to a single space with leading/trailing whitespace trimmed.
defp plain_text(html) do
  html
  # Script/style source and HTML comments are never visible page text. They
  # must be dropped together with their contents; stripping only the tags
  # would leak JavaScript/CSS source and comment text into the index.
  |> String.replace(~r/<(script|style)\b[^>]*>.*?<\/\1\s*>/is, " ")
  |> String.replace(~r/<!--.*?-->/s, " ")
  # Every remaining tag becomes a space so adjacent elements do not fuse
  # their text together.
  |> String.replace(~r/<[^>]+>/, " ")
  |> decode_entities()
  |> String.replace(~r/\s+/u, " ")
  |> String.trim()
end
# Builds the JavaScript snippet that assigns `window.bDSPagefind`, embedding
# the JSON-encoded language as its `language` property.
defp ui_js(language) do
  encoded = Jason.encode!(language)
  "window.bDSPagefind = { language: #{encoded} };\n"
end
defp ui_css do
".pagefind-ui{display:block;}\n"
# Decodes the small set of HTML character references expected in generated
# pages (&lt; &gt; &quot; &#39; &nbsp; &amp;) into their literal characters.
# Any other entity is left untouched.
defp decode_entities(text) do
  text
  |> String.replace("&lt;", "<")
  |> String.replace("&gt;", ">")
  |> String.replace("&quot;", "\"")
  |> String.replace("&#39;", "'")
  |> String.replace("&nbsp;", " ")
  # `&amp;` must be decoded last: decoding it first would turn an escaped
  # entity such as `&amp;lt;` into `&lt;` and then wrongly into `<`.
  |> String.replace("&amp;", "&")
end
# Clause for the case where `language` equals `main_language`: returns nil.
defp route_language(main_language, language) when main_language == language, do: nil