60 lines
1.4 KiB
Elixir
60 lines
1.4 KiB
Elixir
defmodule BDS.Embeddings.Backends.InApp do
  @moduledoc false

  # Embedding backend computed entirely in-process: the input text is stemmed,
  # tokenized, expanded with bigrams, hashed into a fixed-size count vector
  # and L2-normalized. No external service or model file is consulted here.

  @behaviour BDS.Embeddings.Backend

  # Fallbacks used when `config :bds, :embeddings` omits the respective key.
  @default_model_id "Xenova/multilingual-e5-small"
  @default_dimensions 384

  # Returns the model id and vector dimensionality, read from the
  # `:embeddings` keyword list of the `:bds` application environment at call
  # time (so runtime configuration changes are picked up).
  @impl true
  def model_info do
    config = Application.get_env(:bds, :embeddings, [])

    %{
      model_id: Keyword.get(config, :model_id, @default_model_id),
      dimensions: Keyword.get(config, :dimensions, @default_dimensions)
    }
  end

  # Builds the embedding for `text`.
  #
  # Options:
  #   * `:language` - forwarded as the second argument to `BDS.Search.stem/2`
  #     (`nil` when the option is absent).
  #
  # Always returns `{:ok, vector}` where `vector` is a list of floats whose
  # length equals `model_info().dimensions`.
  @impl true
  def embed(text, opts) when is_binary(text) and is_list(opts) do
    language = Keyword.get(opts, :language)
    dimensions = model_info().dimensions

    vector =
      text
      |> BDS.Search.stem(language)
      |> tokenize()
      |> weighted_terms()
      |> project_to_vector(dimensions)
      |> normalize()

    {:ok, vector}
  end

  # Lowercases the text and returns every maximal run of alphanumeric
  # characters, in order of appearance.
  defp tokenize(text) do
    ~r/[[:alnum:]]+/u
    |> Regex.scan(String.downcase(text))
    # The pattern has no capture groups, so each match is a one-element list.
    |> List.flatten()
  end

  # Returns the tokens followed by every adjacent token pair joined with "::".
  # Inputs with fewer than two tokens produce no bigrams (`:discard`).
  defp weighted_terms(tokens) do
    bigrams =
      tokens
      |> Enum.chunk_every(2, 1, :discard)
      |> Enum.map(&Enum.join(&1, "::"))

    tokens ++ bigrams
  end

  # Hashes each term into one of `dimensions` buckets and counts the hits per
  # bucket. Distinct terms that hash to the same bucket share a counter.
  defp project_to_vector(terms, dimensions) do
    terms
    |> Enum.reduce(:array.new(dimensions, default: 0.0), fn term, acc ->
      # phash2/2 yields an integer in 0..dimensions-1, i.e. a valid array index.
      index = :erlang.phash2(term, dimensions)
      :array.set(index, :array.get(index, acc) + 1.0, acc)
    end)
    |> :array.to_list()
  end

  # Scales the vector to unit Euclidean length. An all-zero vector is returned
  # unchanged so that input without any terms does not divide by zero.
  defp normalize(vector) do
    norm = :math.sqrt(Enum.reduce(vector, 0.0, fn value, acc -> acc + value * value end))

    if norm == 0.0 do
      vector
    else
      Enum.map(vector, &(&1 / norm))
    end
  end
end