fix: implement TD-05, replacement of XML parser
This commit is contained in:
@@ -1,143 +1,362 @@
|
||||
defmodule BDS.WxrParser do
|
||||
@moduledoc false
|
||||
|
||||
require Record
|
||||
defmodule ParserState do
|
||||
@moduledoc false
|
||||
|
||||
Record.defrecord(:xmlElement, Record.extract(:xmlElement, from_lib: "xmerl/include/xmerl.hrl"))
|
||||
|
||||
Record.defrecord(
|
||||
:xmlAttribute,
|
||||
Record.extract(:xmlAttribute, from_lib: "xmerl/include/xmerl.hrl")
|
||||
)
|
||||
|
||||
Record.defrecord(:xmlText, Record.extract(:xmlText, from_lib: "xmerl/include/xmerl.hrl"))
|
||||
|
||||
def parse_file(file_path) when is_binary(file_path) do
|
||||
file_path
|
||||
|> File.read!()
|
||||
|> parse_xml()
|
||||
defstruct stack: [],
|
||||
channel_seen?: false,
|
||||
site: %{title: "", link: "", description: "", language: ""},
|
||||
categories: [],
|
||||
tags: [],
|
||||
items: [],
|
||||
current_category: nil,
|
||||
current_tag: nil,
|
||||
current_item: nil,
|
||||
current_taxonomy: nil,
|
||||
text: ""
|
||||
end
|
||||
|
||||
def parse_xml(xml_content) when is_binary(xml_content) do
|
||||
{document, _rest} = :xmerl_scan.string(String.to_charlist(xml_content))
|
||||
defmodule Handler do
|
||||
@moduledoc false
|
||||
|
||||
case :xmerl_xpath.string(~c"/rss/channel", document) do
|
||||
[channel] ->
|
||||
%{
|
||||
site: parse_site(channel),
|
||||
posts: parse_post_like_items(channel),
|
||||
pages: parse_items(channel, "page"),
|
||||
media: parse_media(channel),
|
||||
categories: parse_categories(channel),
|
||||
tags: parse_tags(channel)
|
||||
}
|
||||
@behaviour Saxy.Handler
|
||||
|
||||
_other ->
|
||||
raise RuntimeError, "Invalid WXR file: no <channel> element found"
|
||||
alias BDS.WxrParser.ParserState
|
||||
|
||||
# Saxy SAX callback. Every clause returns `{:ok, state}` so parsing always continues.
#
#   * `:start_document` / `:end_document` leave the state untouched.
#   * `:characters` appends the chunk to the text buffer of the open element.
#   * `:start_element` pushes the element name, clears the text buffer and opens
#     whichever record (channel, category, tag, item, item taxonomy) the
#     parent/name pair designates.
#   * `:end_element` hands the trimmed text buffer to the capture/finish helpers,
#     then pops the element name and clears the buffer.
@impl Saxy.Handler
def handle_event(:start_document, _prolog, state), do: {:ok, state}

def handle_event(:end_document, _data, state), do: {:ok, state}

def handle_event(:characters, chars, state) do
  # Appended rather than replaced: the buffer is only cleared on element boundaries.
  {:ok, %{state | text: state.text <> chars}}
end

def handle_event(:start_element, {name, attributes}, state) do
  # Read before pushing: the current top of the stack is the parent of `name`.
  parent = current_name(state)

  state =
    state
    |> push_name(name)
    |> reset_text()
    |> maybe_start_channel(parent, name)
    |> maybe_start_category(parent, name)
    |> maybe_start_tag(parent, name)
    |> maybe_start_item(parent, name)
    |> maybe_start_item_taxonomy(parent, name, attributes)

  {:ok, state}
end

def handle_event(:end_element, name, state) do
  # `name` is still on top of the stack here, so its parent is the second entry.
  parent = parent_name(state)
  text = String.trim(state.text)

  state =
    state
    |> maybe_capture_site_field(parent, name, text)
    |> maybe_capture_category_field(parent, name, text)
    |> maybe_finish_category(parent, name)
    |> maybe_capture_tag_field(parent, name, text)
    |> maybe_finish_tag(parent, name)
    |> maybe_capture_item_field(parent, name, text)
    |> maybe_finish_item_taxonomy(parent, name, text)
    |> maybe_finish_item(parent, name)
    |> pop_name()
    |> reset_text()

  {:ok, state}
end
|
||||
|
||||
# Name of the innermost open element (top of the stack), or nil when nothing is open.
defp current_name(%ParserState{stack: stack}) do
  case stack do
    [top | _below] -> top
    _ -> nil
  end
end
|
||||
|
||||
# Name of the element enclosing the innermost open element (second stack entry),
# or nil when fewer than two elements are open.
defp parent_name(%ParserState{stack: stack}) do
  case stack do
    [_innermost, enclosing | _outer] -> enclosing
    _ -> nil
  end
end
|
||||
|
||||
# Pushes `name` onto the stack of open element names.
defp push_name(state, name) do
  Map.update!(state, :stack, fn open -> [name | open] end)
end
|
||||
|
||||
# Drops the innermost open element name; a state with an empty stack is returned as-is.
defp pop_name(state) do
  case state do
    %ParserState{stack: [_top | remaining]} -> %{state | stack: remaining}
    _ -> state
  end
end
|
||||
|
||||
# Clears the accumulated character buffer.
defp reset_text(state) do
  Map.replace!(state, :text, "")
end
|
||||
|
||||
# Records that a `channel` element directly under `rss` has been opened.
defp maybe_start_channel(state, parent, name) do
  case {parent, name} do
    {"rss", "channel"} -> %{state | channel_seen?: true}
    _ -> state
  end
end
|
||||
|
||||
# Opens a blank category record when `wp:category` starts directly under `channel`.
defp maybe_start_category(state, parent, name) do
  case {parent, name} do
    {"channel", "wp:category"} ->
      %{state | current_category: %{name: "", slug: "", parent: ""}}

    _ ->
      state
  end
end
|
||||
|
||||
# Opens a blank tag record when `wp:tag` starts directly under `channel`.
defp maybe_start_tag(state, parent, name) do
  case {parent, name} do
    {"channel", "wp:tag"} ->
      %{state | current_tag: %{name: "", slug: ""}}

    _ ->
      state
  end
end
|
||||
|
||||
# Opens a blank item record when `item` starts directly under `channel`.
defp maybe_start_item(state, parent, name) do
  case {parent, name} do
    {"channel", "item"} -> %{state | current_item: empty_item()}
    _ -> state
  end
end
|
||||
|
||||
# When a `category` element opens under `item`, remembers its `domain` attribute
# so the closing handler can file the text under categories or tags.
defp maybe_start_item_taxonomy(state, parent, name, attributes) do
  if parent === "item" and name === "category" do
    domain = attribute_value(attributes, "domain")
    %{state | current_taxonomy: %{domain: domain}}
  else
    state
  end
end
|
||||
|
||||
# Stores channel-level `title`, `link`, `description` and `language` text in
# `state.site`. Only applies while no category, tag or item record is open and
# the closing element sits directly under `channel`.
defp maybe_capture_site_field(state, parent, name, text) do
  at_channel_level? =
    parent === "channel" and
      match?(%ParserState{current_category: nil, current_tag: nil, current_item: nil}, state)

  site_fields = %{
    "title" => :title,
    "link" => :link,
    "description" => :description,
    "language" => :language
  }

  case {at_channel_level?, Map.fetch(site_fields, name)} do
    {true, {:ok, field}} -> %{state | site: Map.replace!(state.site, field, text)}
    _ -> state
  end
end
|
||||
|
||||
# Copies the text of a recognised `wp:category` child element into the open
# category record. Does nothing when no category is open or the element is unknown.
defp maybe_capture_category_field(
       %ParserState{current_category: category} = state,
       "wp:category",
       name,
       text
     )
     when not is_nil(category) do
  category_fields = %{
    "wp:cat_name" => :name,
    "wp:category_nicename" => :slug,
    "wp:category_parent" => :parent
  }

  case Map.fetch(category_fields, name) do
    {:ok, field} -> %{state | current_category: Map.put(category, field, text)}
    :error -> state
  end
end

defp maybe_capture_category_field(state, _parent, _name, _text), do: state
|
||||
|
||||
# On the closing `wp:category` under `channel`, moves the open category record
# to the front of `categories` and clears `current_category`.
defp maybe_finish_category(state, parent, name) do
  case {state, parent, name} do
    {%ParserState{current_category: category}, "channel", "wp:category"}
    when category != nil ->
      %{state | categories: [category | state.categories], current_category: nil}

    _ ->
      state
  end
end
|
||||
|
||||
# Copies the text of a recognised `wp:tag` child element into the open tag
# record. Does nothing when no tag is open or the element is unknown.
defp maybe_capture_tag_field(%ParserState{current_tag: tag} = state, "wp:tag", name, text)
     when not is_nil(tag) do
  tag_fields = %{"wp:tag_name" => :name, "wp:tag_slug" => :slug}

  case Map.fetch(tag_fields, name) do
    {:ok, field} -> %{state | current_tag: Map.put(tag, field, text)}
    :error -> state
  end
end

defp maybe_capture_tag_field(state, _parent, _name, _text), do: state
|
||||
|
||||
# On the closing `wp:tag` under `channel`, moves the open tag record to the
# front of `tags` and clears `current_tag`.
defp maybe_finish_tag(state, parent, name) do
  case {state, parent, name} do
    {%ParserState{current_tag: tag}, "channel", "wp:tag"} when tag != nil ->
      %{state | tags: [tag | state.tags], current_tag: nil}

    _ ->
      state
  end
end
|
||||
|
||||
# Copies the text of a recognised direct child of `item` into the open item
# record. Does nothing when no item is open or the element is not in the table.
defp maybe_capture_item_field(%ParserState{current_item: item} = state, "item", name, text)
     when not is_nil(item) do
  item_fields = %{
    "title" => :title,
    "pubDate" => :pub_date,
    "dc:creator" => :creator,
    "content:encoded" => :content,
    "excerpt:encoded" => :excerpt,
    "wp:post_id" => :post_id,
    "wp:post_date" => :post_date,
    "wp:post_modified" => :post_modified,
    "wp:post_name" => :post_name,
    "wp:status" => :status,
    "wp:post_type" => :post_type,
    "wp:post_parent" => :post_parent,
    "wp:attachment_url" => :attachment_url
  }

  case Map.fetch(item_fields, name) do
    {:ok, field} -> %{state | current_item: Map.put(item, field, text)}
    :error -> state
  end
end

defp maybe_capture_item_field(state, _parent, _name, _text), do: state
|
||||
|
||||
# Handles the closing `category` element under `item`: files the element text
# under the open item's `categories` (domain "category") or `tags` (domain
# "post_tag"), then clears `current_taxonomy`. Empty text and other domains are
# dropped.

# No taxonomy element is open: nothing to do.
defp maybe_finish_item_taxonomy(%ParserState{current_taxonomy: nil} = state, _parent, _name, _text),
  do: state

# A taxonomy was opened under an `item` that is not tracked as the current item
# (item records are only opened for `item` directly under `channel`). There is
# no record to attach the text to, so just discard the taxonomy instead of
# crashing on `update_in` over a nil item.
defp maybe_finish_item_taxonomy(%ParserState{current_item: nil} = state, "item", "category", _text),
  do: %{state | current_taxonomy: nil}

defp maybe_finish_item_taxonomy(%ParserState{} = state, "item", "category", text) do
  domain = Map.get(state.current_taxonomy, :domain)

  state =
    cond do
      text == "" -> state
      domain == "category" -> update_in(state.current_item.categories, &(&1 ++ [text]))
      domain == "post_tag" -> update_in(state.current_item.tags, &(&1 ++ [text]))
      true -> state
    end

  %{state | current_taxonomy: nil}
end

defp maybe_finish_item_taxonomy(state, _parent, _name, _text), do: state
|
||||
|
||||
# On the closing `item` under `channel`, moves the open item record to the
# front of `items` and clears `current_item`.
defp maybe_finish_item(state, parent, name) do
  case {state, parent, name} do
    {%ParserState{current_item: item}, "channel", "item"} when item != nil ->
      %{state | items: [item | state.items], current_item: nil}

    _ ->
      state
  end
end
|
||||
|
||||
# Blank item record: every scalar field starts as "", both taxonomy lists as [].
defp empty_item do
  text_fields = [
    :post_id,
    :title,
    :post_name,
    :content,
    :excerpt,
    :pub_date,
    :post_date,
    :post_modified,
    :creator,
    :status,
    :post_type,
    :post_parent,
    :attachment_url
  ]

  text_fields
  |> Map.new(fn field -> {field, ""} end)
  |> Map.merge(%{categories: [], tags: []})
end
|
||||
|
||||
# Looks up `name` among `{name, value}` attribute pairs and returns the first
# truthy value found, or nil when there is none.
defp attribute_value(attributes, name) do
  hit =
    Enum.find(attributes, fn pair ->
      match?({^name, value} when value not in [nil, false], pair)
    end)

  case hit do
    {_key, value} -> value
    nil -> nil
  end
end
|
||||
end
|
||||
|
||||
defp parse_site(channel) do
|
||||
# Parses the WXR export at `file_path`, feeding the file to Saxy in 32 KiB
# chunks. Returns the map built by `build_result/1`; raises via
# `parse_document/2` on failure.
def parse_file(file_path) when is_binary(file_path) do
  chunks = File.stream!(file_path, 32_768, [])

  parse_document(chunks, fn stream ->
    Saxy.parse_stream(stream, Handler, %ParserState{})
  end)
end
|
||||
|
||||
# Parses a WXR document held in memory as a binary. Returns the map built by
# `build_result/1`; raises via `parse_document/2` on failure.
def parse_xml(xml_content) when is_binary(xml_content) do
  parse_document(xml_content, fn xml ->
    Saxy.parse_string(xml, Handler, %ParserState{})
  end)
end
|
||||
|
||||
# Runs `parser` on `input` and turns the outcome into the result map.
# Raises RuntimeError when the parser reports an error or when no `channel`
# element was seen.
defp parse_document(input, parser) do
  outcome = parser.(input)

  case outcome do
    {:error, reason} ->
      raise RuntimeError, Exception.message(reason)

    {:ok, %ParserState{channel_seen?: false}} ->
      raise RuntimeError, "Invalid WXR file: no <channel> element found"

    {:ok, %ParserState{channel_seen?: true} = final_state} ->
      build_result(final_state)
  end
end
|
||||
|
||||
defp build_result(%ParserState{} = state) do
|
||||
items = Enum.reverse(state.items)
|
||||
|
||||
%{
|
||||
title: child_text(channel, "title"),
|
||||
link: child_text(channel, "link"),
|
||||
description: child_text(channel, "description"),
|
||||
language: child_text(channel, "language")
|
||||
site: state.site,
|
||||
posts:
|
||||
items
|
||||
|> Enum.filter(fn item -> item.post_type not in ["", "attachment", "page"] end)
|
||||
|> Enum.map(&parse_post_item/1),
|
||||
pages:
|
||||
items
|
||||
|> Enum.filter(&(&1.post_type == "page"))
|
||||
|> Enum.map(&parse_post_item/1),
|
||||
media:
|
||||
items
|
||||
|> Enum.filter(&(&1.post_type == "attachment"))
|
||||
|> Enum.map(&parse_media_item/1),
|
||||
categories: Enum.reverse(state.categories),
|
||||
tags: Enum.reverse(state.tags)
|
||||
}
|
||||
end
|
||||
|
||||
# Builds one `%{name, slug, parent}` map per `wp:category` element that is a
# direct child of `channel`.
defp parse_categories(channel) do
  for element <- direct_children(channel), full_name(element) == "wp:category" do
    %{
      name: child_text(element, "cat_name"),
      slug: child_text(element, "category_nicename"),
      parent: child_text(element, "category_parent")
    }
  end
end
|
||||
|
||||
# Builds one `%{name, slug}` map per `wp:tag` element that is a direct child
# of `channel`.
defp parse_tags(channel) do
  for element <- direct_children(channel), full_name(element) == "wp:tag" do
    %{
      name: child_text(element, "tag_name"),
      slug: child_text(element, "tag_slug")
    }
  end
end
|
||||
|
||||
# Parses every `item` under `channel` whose `post_type` text equals `expected_type`.
defp parse_items(channel, expected_type) do
  for item <- direct_children_named(channel, "item"),
      child_text(item, "post_type") == expected_type do
    parse_post_item(item)
  end
end
|
||||
|
||||
# Parses every `item` under `channel` except those whose `post_type` is blank,
# "attachment" or "page".
defp parse_post_like_items(channel) do
  excluded_types = ["", "attachment", "page"]

  channel
  |> direct_children_named("item")
  |> Enum.reject(fn item -> child_text(item, "post_type") in excluded_types end)
  |> Enum.map(fn item -> parse_post_item(item) end)
end
|
||||
|
||||
# Parses every `item` under `channel` whose `post_type` text is "attachment".
defp parse_media(channel) do
  for item <- direct_children_named(channel, "item"),
      child_text(item, "post_type") == "attachment" do
    parse_media_item(item)
  end
end
|
||||
|
||||
defp parse_post_item(item) do
|
||||
%{
|
||||
wp_id: parse_integer(child_text(item, "post_id")),
|
||||
title: child_text(item, "title"),
|
||||
slug: child_text(item, "post_name"),
|
||||
content: child_text_by_full_name(item, "content:encoded"),
|
||||
excerpt: child_text_by_full_name(item, "excerpt:encoded"),
|
||||
pub_date: blank_to_nil(child_text(item, "pubDate")),
|
||||
post_date: blank_to_nil(child_text(item, "post_date")),
|
||||
post_modified: blank_to_nil(child_text(item, "post_modified")),
|
||||
creator: child_text_by_full_name(item, "dc:creator"),
|
||||
status: child_text(item, "status"),
|
||||
post_type: child_text(item, "post_type"),
|
||||
categories: item_taxonomy(item, "category"),
|
||||
tags: item_taxonomy(item, "post_tag")
|
||||
wp_id: parse_integer(item.post_id),
|
||||
title: item.title,
|
||||
slug: item.post_name,
|
||||
content: item.content,
|
||||
excerpt: item.excerpt,
|
||||
pub_date: blank_to_nil(item.pub_date),
|
||||
post_date: blank_to_nil(item.post_date),
|
||||
post_modified: blank_to_nil(item.post_modified),
|
||||
creator: item.creator,
|
||||
status: item.status,
|
||||
post_type: item.post_type,
|
||||
categories: Enum.reject(item.categories, &(&1 == "")),
|
||||
tags: Enum.reject(item.tags, &(&1 == ""))
|
||||
}
|
||||
end
|
||||
|
||||
defp parse_media_item(item) do
|
||||
attachment_url = child_text(item, "attachment_url")
|
||||
attachment_url = item.attachment_url
|
||||
filename = attachment_url |> Path.basename() |> blank_to_nil() || ""
|
||||
|
||||
%{
|
||||
wp_id: parse_integer(child_text(item, "post_id")),
|
||||
title: child_text(item, "title"),
|
||||
wp_id: parse_integer(item.post_id),
|
||||
title: item.title,
|
||||
url: attachment_url,
|
||||
filename: filename,
|
||||
relative_path: relative_upload_path(attachment_url),
|
||||
pub_date: blank_to_nil(child_text(item, "pubDate")),
|
||||
parent_id: parse_integer(child_text(item, "post_parent")),
|
||||
pub_date: blank_to_nil(item.pub_date),
|
||||
parent_id: parse_integer(item.post_parent),
|
||||
mime_type: MIME.from_path(filename),
|
||||
description: child_text_by_full_name(item, "content:encoded")
|
||||
description: item.content
|
||||
}
|
||||
end
|
||||
|
||||
# Collects the non-empty text of each `category` child of `item` whose
# `domain` attribute equals `domain`.
defp item_taxonomy(item, domain) do
  Enum.flat_map(direct_children_named(item, "category"), fn node ->
    if xml_attr(node, :domain) == domain do
      case text_content(node) do
        "" -> []
        label -> [label]
      end
    else
      []
    end
  end)
end
|
||||
|
||||
defp relative_upload_path(url) when is_binary(url) do
|
||||
marker = "/wp-content/uploads/"
|
||||
|
||||
@@ -147,69 +366,6 @@ defmodule BDS.WxrParser do
|
||||
end
|
||||
end
|
||||
|
||||
# Returns the entries of the element's content that are `:xmlElement` records,
# leaving out text and any other node kinds.
defp direct_children(element) do
  for node <- xmlElement(element, :content),
      is_tuple(node) and tuple_size(node) > 0 and elem(node, 0) == :xmlElement do
    node
  end
end
|
||||
|
||||
# Direct child elements whose local (prefix-less) name equals `name`.
defp direct_children_named(element, name) do
  for child <- direct_children(element), local_name(child) == name, do: child
end
|
||||
|
||||
# Text of the first direct child whose local name is `name`; when there is no
# such child the result is whatever `text_content/1` yields for nil.
defp child_text(element, name) do
  case direct_children_named(element, name) do
    [first | _others] -> text_content(first)
    [] -> text_content(nil)
  end
end
|
||||
|
||||
# Text of the first direct child whose full (prefixed) name equals `name`.
defp child_text_by_full_name(element, name) do
  match = Enum.find(direct_children(element), fn child -> full_name(child) == name end)
  text_content(match)
end
|
||||
|
||||
# Concatenates the text found inside `element` (recursing into child elements)
# and trims the result. `nil` yields an empty string.
defp text_content(nil), do: ""

defp text_content(element) do
  fragments =
    for node <- xmlElement(element, :content) do
      # The record tag decides how the node contributes; non-records contribute nothing.
      tag = if is_tuple(node) and tuple_size(node) > 0, do: elem(node, 0)

      case tag do
        :xmlText -> to_string(xmlText(node, :value))
        :xmlElement -> text_content(node)
        _ -> ""
      end
    end

  fragments
  |> Enum.join("")
  |> String.trim()
end
|
||||
|
||||
# Value (as a string) of the first attribute of `element` named `name`, or nil
# when the element has no such attribute.
defp xml_attr(element, name) do
  attributes = xmlElement(element, :attributes)

  case Enum.find(attributes, fn attribute -> xmlAttribute(attribute, :name) == name end) do
    nil -> nil
    attribute -> to_string(xmlAttribute(attribute, :value))
  end
end
|
||||
|
||||
# Element name, including any namespace prefix, as a string.
defp full_name(element) do
  to_string(xmlElement(element, :name))
end
|
||||
|
||||
# Element name with everything up to the last ":" removed.
defp local_name(element) do
  segments = String.split(full_name(element), ":")
  List.last(segments)
end
|
||||
|
||||
defp parse_integer(value) do
|
||||
case Integer.parse(to_string(value)) do
|
||||
{parsed, _rest} -> parsed
|
||||
|
||||
Reference in New Issue
Block a user