Here's an sxml style parser based on Rich's original xml.clj, in case
anyone is interested.
It's not meant to compete with xml.clj, its main purpose is to help
with a port of sxpath/sxslt if I ever get around to it. It's just
like sxml except the attributes are clojure maps. It does handle
mixed content (I see Rich has added that too in SVN, great!) and
attempts to do the right thing by optionally ignoring whitespace
between elements.
Produced content looks like this:
(*top*
(account {:title "Savings 1"}
(ownerid "12398")
(balance {:currency "USD"} "3212.12")
(descr-html "Main " (b "short term savings") " account.")))
PS: I'm sure the source will be garbled by randomly placed line
breaks. Is there a way to upload files here? I was able to see the
"Files" page but I don't see an upload link...
; Copyright (c) Rich Hickey. All rights reserved.
; The use and distribution terms for this software are covered by
the
; Common Public License 1.0 (
http://opensource.org/licenses/cpl.php)
; which can be found in the file CPL.TXT at the root of this
distribution.
; By using this software in any fashion, you are agreeing to be
bound by
; the terms of this license.
; You must not remove this notice, or any other, from this software.
(in-ns 'xml)
(clojure/refer 'clojure)
(import '(org.xml.sax ContentHandler Attributes SAXException)
'(javax.xml.parsers SAXParser SAXParserFactory)
'(org.xml.sax InputSource))
(def *stack*)
(def *current*)
(def *pending-chars*)
(def *state*)
(defn finalize-element [e] (reverse e))
(defn add-pending-char-data []
(set! *current* (conj *current* (str *pending-chars*)))
(set! *pending-chars* nil))
(defn all-whitespace? [chars-array start len]
(loop [i (+ start (dec len))]
(if (< i start)
true
(if (not (. Character (isWhitespace (aget chars-array i))))
false
(recur (dec i))))))
(defn content-handler [opts]
(new clojure.lang.XMLHandler
(implement [ContentHandler]
(startElement [uri local-name q-name #^Attributes atts]
(let [make-attrs (fn [ret i]
(if (neg? i)
ret
(recur (assoc ret
(. clojure.lang.Keyword (intern (symbol (. atts (getQName
i)))))
(. atts (getValue i)))
(dec i))))
attrs (make-attrs {} (dec (. atts (getLength))))
new-el (if (. attrs (isEmpty))
(list (symbol q-name))
(list attrs (symbol q-name)))]
(when *pending-chars*
(let [ignore (and (:ignore-whitespace-between-elements opts)
(or (= *state* :ws-read-after-element-start)
(= *state* :ws-read-after-element-end))) ]
(if ignore
(set! *pending-chars* nil)
(add-pending-char-data))))
(set! *stack* (conj *stack* *current*))
(set! *current* new-el)
(set! *state* :element-started))
nil)
(endElement [uri local-name q-name]
(when *pending-chars*
(let [ignore (and (:ignore-whitespace-between-elements opts)
(= *state* :ws-read-after-element-end)) ]
(if ignore
(set! *pending-chars* nil)
(add-pending-char-data))))
(set! *current* (conj (peek *stack*) (finalize-element
*current*)))
(set! *stack* (pop *stack*))
(set! *state* :element-ended)
nil)
(characters [cdata start len]
(when-not *pending-chars*
(set! *pending-chars* (new StringBuilder)))
(let [#^StringBuilder sb *pending-chars*]
(. sb (append cdata start len))
(set! *state*
(if (and (:ignore-whitespace-between-elements opts)
(all-whitespace? cdata start len))
(cond
(or (= *state* :element-started) (= *state* :ws-read-after-
element-start)) :ws-read-after-element-start
(or (= *state* :element-ended) (= *state* :ws-read-after-
element-end)) :ws-read-after-element-end
true :chars-read)
:chars-read)))
nil))))
;; TODO:
;; Add option: :validating (in which case tell parser to ignore
ignorable whitespace).
;; Make parser namespace aware (test - what's the difference?)
(defn parse
([s] (parse s {:ignore-whitespace-between-elements true}))
([s opts]
(let [p (.. SAXParserFactory (newInstance) (newSAXParser))]
(binding [*stack* nil
*current* '(*top*)
*state* nil
*pending-chars* nil]
(. p (parse (new InputSource s) (content-handler opts)))
(finalize-element *current*)))))
(import '(
java.io StringReader))
(defn test1 []
(let [ cxml '(*top*
(account {:title "Savings 1"}
(ownerid "12398")
(balance {:currency "USD"} "3212.12")
(descr-html "Main " (b "short term savings") " account.")))
xml (str "<account title='Savings 1'>"
"<ownerid>12398</ownerid>"
"<balance currency=\"USD\">3212.12</balance>"
"<descr-html>Main <b>short term savings</b> account.</descr-
html>"
"</account>") ]
(assert (= cxml (parse (new StringReader xml))))
println "Test succeeded."))