defmodule BDS.WxrParser do
  @moduledoc false

  # Parser for WXR (WordPress export) XML documents, built on Erlang's xmerl.
  #
  # The document is scanned into xmerl records and then walked to produce plain
  # maps describing the site, its posts, pages, media attachments, categories
  # and tags. All text values are returned as trimmed binaries; a missing
  # element yields "".

  require Record

  Record.defrecord(:xmlElement, Record.extract(:xmlElement, from_lib: "xmerl/include/xmerl.hrl"))

  Record.defrecord(
    :xmlAttribute,
    Record.extract(:xmlAttribute, from_lib: "xmerl/include/xmerl.hrl")
  )

  Record.defrecord(:xmlText, Record.extract(:xmlText, from_lib: "xmerl/include/xmerl.hrl"))

  # Items whose post type is one of these are not reported under `:posts`
  # ("page" and "attachment" have their own keys, "" means no post type).
  @non_post_types ["", "attachment", "page"]

  # URL segment that precedes the upload-relative part of an attachment URL.
  @upload_marker "/wp-content/uploads/"

  @doc """
  Reads the file at `file_path` and parses it with `parse_xml/1`.

  Raises `File.Error` if the file cannot be read; otherwise behaves exactly
  like `parse_xml/1`.
  """
  @spec parse_file(String.t()) :: map()
  def parse_file(file_path) when is_binary(file_path) do
    file_path
    |> File.read!()
    |> parse_xml()
  end

  @doc """
  Parses the XML document in `xml_content`.

  Returns a map with the keys `:site`, `:posts`, `:pages`, `:media`,
  `:categories` and `:tags`.

  Raises `RuntimeError` unless the document contains exactly one
  `/rss/channel` element. Errors signalled by the xmerl scanner itself (for
  example on malformed XML) are not intercepted here.
  """
  @spec parse_xml(binary()) :: map()
  def parse_xml(xml_content) when is_binary(xml_content) do
    # xmerl decodes the character list itself, according to the encoding the
    # document declares (UTF-8 when none is given), so it has to receive the
    # raw bytes. Handing it already-decoded code points (String.to_charlist/1)
    # makes every non-ASCII character look like a malformed UTF-8 sequence.
    {document, _rest} = :xmerl_scan.string(:erlang.binary_to_list(xml_content))

    case :xmerl_xpath.string(~c"/rss/channel", document) do
      [channel] -> parse_channel(channel)
      _other -> raise RuntimeError, "Invalid WXR file: no element found"
    end
  end

  # Builds the result map for a single <channel> element. The <item> children
  # are paired with their post type once and then distributed over the
  # :posts / :pages / :media keys, preserving document order.
  defp parse_channel(channel) do
    typed_items =
      channel
      |> direct_children_named("item")
      |> Enum.map(fn item -> {child_text(item, "post_type"), item} end)

    posts =
      for {type, item} <- typed_items, type not in @non_post_types, do: parse_post_item(item)

    pages = for {"page", item} <- typed_items, do: parse_post_item(item)
    media = for {"attachment", item} <- typed_items, do: parse_media_item(item)

    %{
      site: parse_site(channel),
      posts: posts,
      pages: pages,
      media: media,
      categories: parse_categories(channel),
      tags: parse_tags(channel)
    }
  end

  # Channel-level metadata.
  defp parse_site(channel) do
    %{
      title: child_text(channel, "title"),
      link: child_text(channel, "link"),
      description: child_text(channel, "description"),
      language: child_text(channel, "language")
    }
  end

  # Channel-level <wp:category> definitions.
  defp parse_categories(channel) do
    for element <- direct_children(channel), full_name(element) == "wp:category" do
      %{
        name: child_text(element, "cat_name"),
        slug: child_text(element, "category_nicename"),
        parent: child_text(element, "category_parent")
      }
    end
  end

  # Channel-level <wp:tag> definitions.
  defp parse_tags(channel) do
    for element <- direct_children(channel), full_name(element) == "wp:tag" do
      %{
        name: child_text(element, "tag_name"),
        slug: child_text(element, "tag_slug")
      }
    end
  end

  # Converts a post-like <item> (post, page or custom type) into a map.
  # Date fields are nil when absent or empty; wp_id falls back to 0.
  defp parse_post_item(item) do
    %{
      wp_id: parse_integer(child_text(item, "post_id")),
      title: child_text(item, "title"),
      slug: child_text(item, "post_name"),
      content: child_text_by_full_name(item, "content:encoded"),
      excerpt: child_text_by_full_name(item, "excerpt:encoded"),
      pub_date: blank_to_nil(child_text(item, "pubDate")),
      post_date: blank_to_nil(child_text(item, "post_date")),
      post_modified: blank_to_nil(child_text(item, "post_modified")),
      creator: child_text_by_full_name(item, "dc:creator"),
      status: child_text(item, "status"),
      post_type: child_text(item, "post_type"),
      categories: item_taxonomy(item, "category"),
      tags: item_taxonomy(item, "post_tag")
    }
  end

  # Converts an attachment <item> into a map. The filename is the last path
  # segment of the attachment URL ("" when there is no URL) and the MIME type
  # is derived from that filename.
  defp parse_media_item(item) do
    attachment_url = child_text(item, "attachment_url")
    filename = Path.basename(attachment_url)

    %{
      wp_id: parse_integer(child_text(item, "post_id")),
      title: child_text(item, "title"),
      url: attachment_url,
      filename: filename,
      relative_path: relative_upload_path(attachment_url),
      pub_date: blank_to_nil(child_text(item, "pubDate")),
      parent_id: parse_integer(child_text(item, "post_parent")),
      mime_type: MIME.from_path(filename),
      description: child_text_by_full_name(item, "content:encoded")
    }
  end

  # Non-empty texts of the item's <category domain="..."> children whose
  # `domain` attribute equals `domain`.
  defp item_taxonomy(item, domain) do
    for category <- direct_children_named(item, "category"),
        xml_attr(category, :domain) == domain,
        label = text_content(category),
        label != "",
        do: label
  end

  # Part of `url` after the first "/wp-content/uploads/"; falls back to the
  # URL's basename when the marker does not occur.
  defp relative_upload_path(url) when is_binary(url) do
    case String.split(url, @upload_marker, parts: 2) do
      [_prefix, suffix] -> suffix
      _other -> Path.basename(url)
    end
  end

  # Child nodes of `element` that are themselves elements (text, comments and
  # other node kinds are dropped).
  defp direct_children(element) do
    element
    |> xmlElement(:content)
    |> Enum.filter(&Record.is_record(&1, :xmlElement))
  end

  # Child elements whose name, ignoring any "prefix:" part, equals `name`.
  defp direct_children_named(element, name) do
    Enum.filter(direct_children(element), &(local_name(&1) == name))
  end

  # Text of the first child element with local name `name`, or "" if none.
  defp child_text(element, name) do
    element
    |> direct_children_named(name)
    |> List.first()
    |> text_content()
  end

  # Text of the first child element whose full (prefixed) name equals `name`,
  # or "" if none.
  defp child_text_by_full_name(element, name) do
    element
    |> direct_children()
    |> Enum.find(&(full_name(&1) == name))
    |> text_content()
  end

  # Concatenated, trimmed text of an element, descending into nested elements.
  defp text_content(nil), do: ""

  defp text_content(element) do
    element
    |> xmlElement(:content)
    |> Enum.map_join("", &node_text/1)
    |> String.trim()
  end

  # Text contribution of a single content node.
  defp node_text(node) when Record.is_record(node, :xmlText) do
    node |> xmlText(:value) |> to_string()
  end

  defp node_text(node) when Record.is_record(node, :xmlElement), do: text_content(node)
  defp node_text(_other), do: ""

  # Value of attribute `name` (an atom, as stored by xmerl) as a binary, or
  # nil when the element has no such attribute.
  defp xml_attr(element, name) do
    element
    |> xmlElement(:attributes)
    |> Enum.find_value(fn attribute ->
      if xmlAttribute(attribute, :name) == name do
        attribute |> xmlAttribute(:value) |> to_string()
      end
    end)
  end

  # Element name as a binary, including any namespace prefix.
  defp full_name(element), do: element |> xmlElement(:name) |> to_string()

  # Element name with everything up to the last ":" removed.
  defp local_name(element) do
    element
    |> full_name()
    |> String.split(":")
    |> List.last()
  end

  # Leading integer of `value`; 0 when it does not start with an integer.
  defp parse_integer(value) do
    case Integer.parse(to_string(value)) do
      {parsed, _rest} -> parsed
      :error -> 0
    end
  end

  defp blank_to_nil(""), do: nil
  defp blank_to_nil(value), do: value
end