%% HTML to text utilities
%% Turn an HTML document, given as a binary, into a plain-text binary made
%% of the text nodes found under its <body> element.
%%
%% A failure to parse the original input is treated as an empty tree. When
%% no <body> element is found, the input is wrapped in <body>...</body> and
%% parsed once more; that second parse is not guarded, so an exception
%% raised there propagates to the caller.
%%
%% TODO: detect encoding and convert to utf-8
html_to_text(Html) ->
    Parsed =
        try
            mochiweb_html:parse(Html)
        catch
            _:_ -> []
        end,
    BodyTag = <<"body">>,
    BodyEl =
        case find_html_tag(Parsed, BodyTag) of
            {_, _, _} = Found ->
                Found;
            undefined ->
                %% Fragment without <body>: wrap it and parse the wrapped
                %% markup directly (no recursion into html_to_text/1).
                Wrapped = <<"<body>", Html/binary, "</body>">>,
                WrappedTree = mochiweb_html:parse(Wrapped),
                find_html_tag(WrappedTree, BodyTag)
        end,
    preorder_text(BodyEl).
%% Return the first element named `Tag', searching in pre-order (a node is
%% examined before its children, children before following siblings).
%%
%% The first argument is either a single `{Name, Attrs, Children}' element
%% or a list of nodes. List entries that are not 3-tuples are skipped.
%% Returns the matching `{Tag, Attrs, Children}' tuple, or `undefined' when
%% nothing matches.
find_html_tag({_, _, _} = Node, Tag) ->
    %% A lone element is searched as a one-item node list.
    find_html_tag([Node], Tag);
find_html_tag([], _Tag) ->
    undefined;
find_html_tag([{Tag, _, _} = Node | _], Tag) ->
    Node;
find_html_tag([{_, _, Children} | Siblings], Tag) ->
    %% Descend first; fall back to the remaining siblings only on a miss.
    case find_html_tag(Children, Tag) of
        undefined -> find_html_tag(Siblings, Tag);
        Found -> Found
    end;
find_html_tag([_ | Siblings], Tag) ->
    find_html_tag(Siblings, Tag).
%% Collect the text of `Tree' into a single binary, starting from an empty
%% accumulator and with space separation switched on.
preorder_text(Tree) ->
    preorder_text1(Tree, true, <<>>).
%% Fold the text content of `Els' onto the binary accumulator `Acc',
%% visiting nodes in pre-order.
%%
%% `Els' may be:
%%   * a list of nodes - each one is folded in turn;
%%   * an `{<<"a">>, Attrs, Childs}' element - rendered as a link (below);
%%   * any other `{Name, Attrs, Childs}' element - only its children count;
%%   * a binary text node - appended to `Acc';
%%   * anything else (comment, processing instruction, ...) - skipped.
%%
%% `AddSpaces' selects the separator written in front of every emitted
%% piece: a single space when `true', nothing when `false'.
%%
%% An anchor carrying an `href' attribute becomes `[Anchor](Href)', or just
%% `Href' when the anchor text is identical to the URL. An anchor without
%% `href' is handled like any other element.
preorder_text1(Els, AddSpaces, Acc) when is_list(Els) ->
    lists:foldl(fun(El, Acc1) ->
                        preorder_text1(El, AddSpaces, Acc1)
                end, Acc, Els);
preorder_text1({<<"a">>, Attrs, Childs}, AddSpaces, Acc) ->
    case proplists:get_value(<<"href">>, Attrs) of
        undefined ->
            preorder_text1(Childs, AddSpaces, Acc);
        Href ->
            %% Render the anchor text separately and drop one leading space,
            %% i.e. the separator the text clause puts in front of it.
            Anchor = case preorder_text1(Childs, AddSpaces, <<>>) of
                         <<" ", Rest/binary>> -> Rest;
                         Other -> Other
                     end,
            %% Use the same separator as plain text nodes, so that
            %% AddSpaces =:= false does not get a stray space before links.
            Sep = case AddSpaces of
                      true -> <<" ">>;
                      false -> <<>>
                  end,
            case Anchor =:= Href of
                true ->
                    <<Acc/binary, Sep/binary, Href/binary>>;
                false ->
                    <<Acc/binary, Sep/binary,
                      "[", Anchor/binary, "](", Href/binary, ")">>
            end
    end;
preorder_text1({_El, _At, Childs}, AddSpaces, Acc) ->
    preorder_text1(Childs, AddSpaces, Acc);
preorder_text1(Binary, true, Acc) when is_binary(Binary) ->
    <<Acc/binary, " ", Binary/binary>>;
preorder_text1(Binary, false, Acc) when is_binary(Binary) ->
    <<Acc/binary, Binary/binary>>;
preorder_text1(_, _, Acc) -> % comment/pi etc.
    Acc.
-ifdef(TEST).
%% Text nodes are gathered depth-first, each prefixed with one space;
%% the comment node and the childless element add nothing to the output.
preorder_text_test() ->
    Nested = {<<>>, [],
              [<<"v2">>,
               {comment, <<"comment">>},
               <<"v3">>,
               <<"v4">>]},
    Childless = {<<>>, [], []},
    Root = {<<>>, [],
            [<<"v1">>, Nested, <<"v5">>, Childless, <<"v6">>]},
    Expected = <<" v1 v2 v3 v4 v5 v6">>,
    ?assertEqual(Expected, preorder_text1(Root, true, <<>>)).
%% A link whose anchor text differs from its href is rendered as
%% `[Anchor](Href)' between the surrounding text nodes.
%% NOTE(review): the link element and the assertion were missing from this
%% test (unbalanced brackets, no clause terminator); they are rebuilt here
%% with a placeholder URL - confirm against the intended fixture.
preorder_text_href_test() ->
    Tree = {<<>>, [],
            [<<"pre">>,
             {<<"a">>, [{<<"href">>, <<"http://example.com/">>}],
              [<<"anchor">>]},
             <<"post">>]},
    ?assertEqual(<<" pre [anchor](http://example.com/) post">>,
                 preorder_text1(Tree, true, <<>>)).
%% A link whose anchor text is identical to its href is rendered as the
%% bare URL, without the `[...](...)' wrapping.
%% NOTE(review): the link element and the assertion were missing from this
%% test (the clause ended in a dangling comma); they are rebuilt here with
%% a placeholder URL - confirm against the intended fixture.
preorder_text_href_is_anchor_test() ->
    Tree = {<<>>, [],
            [<<"pre">>,
             {<<"a">>, [{<<"href">>, <<"http://example.com/">>}],
              [<<"http://example.com/">>]},
             <<"post">>]},
    ?assertEqual(<<" pre http://example.com/ post">>,
                 preorder_text1(Tree, true, <<>>)).
-endif.