(** Functions to generate and search parsed html pages.
Uses the Nethtml module to do the heAvy lifting. *)
(** Author: Karl Zilles, released into public domain *)
open Nethtml
open Pcre
open Printf
open Utility
(** {1:Fix questionable "form" decision in dtd} *)
let fix_dtd dtd =
List.map (function
| (name, (elclass, _)) when name = "form" ->
(name, (elclass, `Sub_exclusions( ["form"], `Flow )))
| line -> line
) dtd
let my_dtd = fix_dtd relaxed_html40_dtd
(** {1:create Create an html tree} *)
let get_parsed_html_from_channel inchannel =
let parsed = List.hd (parse ~dtd:my_dtd inchannel) in
inchannel#close_in ();
parsed
(** [ get_parsed_html file ] returns an html tree of the contents of file.
See the Nethtml documentation for the format *)
let get_parsed_html file =
let inchannel = new Netchannels.input_channel (open_in file) in
get_parsed_html_from_channel inchannel
(** [ get_parsed_html file ] returns an html tree of the contents of file.
See the Nethtml documentation for the format *)
let get_parsed_html_from_string str =
let inchannel = new Netchannels.input_string str in
get_parsed_html_from_channel inchannel
(** {1:inspect Inspecting a tag} *)
(** [ match_tag tag doc] returns true if the current tag has type "tag".
Example: [match_tag "td" doc] will return true if doc is a "td" tag. *)
let match_tag thistag = function
| Element (tag,_,_) when tag = thistag -> true
| _ -> false
let tag_type = function
| Element (tag,_,_) -> tag
| _ -> raise Not_found
(** [ attribute doc attr ] returns the value of the attribute attr if it
exists in the current tag, or throws a [ Not_found ] exception if it doesn't *)
let attribute doc name = match doc with
| Element (_,attributes,_) ->
List.assoc name attributes
| _ -> raise Not_found
let as_text, as_text_list =
let rec as_text' = function
| Element (_,_,subdocs) -> as_text_list' subdocs
| Data text -> replace ~pat:"(\\r|\\n)" text
and as_text_list' l =
replace ~pat:"\\s+" ~templ:" "
(List.fold_left (fun current doc -> current ^
(as_text' doc)) "" l) in
(compose strip as_text', compose strip as_text_list')
(** [ as_text doc] returns a string of the text contents of this tag and
all of it's subtags*)
let as_text = as_text
(** [ as_text_list doc] returns a string of the text contents of all the tags
in this list and all of their subtags*)
let as_text_list = as_text_list
let rec as_text_formatted = function
| Element (_,_,subdocs) -> as_text_list_formatted subdocs
| Data text -> text
and as_text_list_formatted l =
(List.fold_left (fun current doc -> current ^
(as_text_formatted doc)) "" l)
(** [ as_html doc] returns a string of the html contents of all the tags
in this list and all of their subtags*)
let as_html_list documentlist =
let result = Buffer.create 2000 in
let outbuffer = new Netchannels.output_buffer result in
write ~dtd:my_dtd outbuffer documentlist;
Buffer.contents result
let as_html document = as_html_list [document]
(** {1:search Search or process an html tree} *)
(** [ iter_document f doc ] runs the function f on every tag in the document *)
let rec iter_document f doc =
f doc;
match doc with
| Element (_,_,subdocs) ->
List.iter (iter_document f) subdocs
| _ -> ()
let fold_left f = fold_left_from_iterator iter_document f
(** [ iter_document f doc ] runs the function f on every tag in the document.
In addition to passing the current element to f, it also passes a list
of all the parent tags of that element. *)
let iter_document_with_parents f doc =
let rec iter_document_with_parents' f parents doc =
f doc parents;
match doc with
| Element (_,_,subdocs) ->
List.iter (iter_document_with_parents' f (doc::parents))
subdocs
| _ -> ()
in
iter_document_with_parents' f [] doc
(** [ list_tags tagname doc ] returns a list of all tags of a certain type
in the html tree *)
let list_tags tagname doc =
let tags = ref [] in
iter_document (fun doc ->
if match_tag tagname doc then
tags := doc :: !tags) doc;
List.rev !tags
(** [ list_tags_with_parents tagname doc ] returns a list of all tags of a
certain type in the html tree, and also includes a list of their parents
with each one. *)
let list_tags_with_parents tagname doc =
let tags = ref [] in
iter_document_with_parents (fun doc parents ->
if match_tag tagname doc then
tags := (doc,parents) :: !tags) doc;
List.rev !tags
(** [ find filter doc ] returns the first element that filter returns
true on, searching the document in a depth first search. Raises a [ Not_found ]
exception if nothing is matched. *)
let find filter document =
let rec find' filter document =
if filter document then
Some document
else
match document with
| Element (_,_,subdocs) ->
let rec loop = (function
| h::t ->
(match find' filter h with
| Some x as result -> result
| None -> loop t)
| [] -> None) in
loop subdocs
| _ -> None in
match find' filter document with
| Some x -> x
| None -> raise Not_found
(** [ find filter doc ] returns the first element that filter returns
true on, searching the document in a depth first search. Note that in
this function we also pass along a list of parents to the filter, and
return the list of parents with the matching object. Raises a [ Not_found ]
exception if nothing is matched. *)
let find_with_parent filter document =
let rec find' filter parents document =
if filter parents document then
Some (document,parents)
else
match document with
| Element (_,_,subdocs) ->
let add_parents = document :: parents in
let rec loop = (function
| h::t ->
(match find' filter add_parents h with
| Some x as result -> result
| None -> loop t)
| [] -> None) in
loop subdocs
| _ -> None in
match find' filter [] document with
| Some x -> x
| None -> raise Not_found
(** [ parse_tags_at_same_level tag doc] looks down from the passed document
and finds the first matching tag, then returns a list of
matching tags with the same parent as the first match. *)
let parse_tags_at_same_level tag document =
let is_tag = match_tag tag in
let first_tr, parents = find_with_parent (function _ -> is_tag)
document in
let siblings = match List.hd parents with
Element (_,_,x) -> x | _ -> assert false in
List.filter is_tag siblings
(** [parse_trs doc] calls parse_tags_at_same_level matching "tr" tags *)
let parse_trs = parse_tags_at_same_level "tr"
(** [parse_tds doc] calls parse_tags_at_same_level matching "td" tags *)
let parse_tds = parse_tags_at_same_level "td"
let get_tag_with_name tag name document =
find (
fun el ->
try match_tag tag el && (attribute el "name")=name
with Not_found -> false
) document
(* form handling *)
let decode = Netencoding.Html.decode_to_latin1
let get_form = get_tag_with_name "form"
let get_select = get_tag_with_name "select"
let select_selection selection =
try
(* look for a "selected" value *)
find (fun el ->
try ignore (attribute el "selected"); true
with Not_found-> false) selection
with Not_found ->
(* otherwise, take first option element *)
find (match_tag "option") selection
let select_value selection =
decode (attribute (select_selection selection) "value")
let select_label selection =
decode (as_text (select_selection selection))
let get_select_value form name =
select_value (get_select name form)
let get_select_label form name =
select_label (get_select name form)
let input_value input =
decode (attribute input "value")
let get_input_value form name =
input_value (get_tag_with_name "input" name form)
let textarea_value textarea =
decode (as_text_formatted textarea)
let get_textarea_value form name =
textarea_value (get_tag_with_name "textarea" name form)
let element_name default el = try attribute el "name"
with Not_found -> default
let list_forms document =
list_tags "form" document
let form_names document =
List.map (element_name "[No name]") (list_forms document)
let element_to_accessor = [
"select", select_value;
"input", input_value;
"textarea", textarea_value;
]
let get_field_list form =
List.rev (fold_left (fun current el ->
try
let accessor = List.assoc (tag_type el) element_to_accessor in
(* printf "%s: %s \n" (tag_type el) (element_name "noname" el); *)
( element_name "noname" el, accessor el ) :: current
with Not_found -> current
) [] form)
let print_field_list a =
printf "[\n";
List.iter (function name,value ->
printf " \"%s\",\"%s\";\n" (String.escaped name) (String.escaped value)
) a;
printf "]\n"