module Xmlm: sig..end
Streaming XML IO.
A well-formed sequence of signals represents an XML document tree traversal in depth first order (this has nothing to do with XML well-formedness). Input pulls a well-formed sequence of signals from a data source and output pushes a well-formed sequence of signals to a data destination. Functions are provided to easily transform sequences of signals to/from arborescent data structures.
Consult the features and limitations and examples of use.
Release 1.1.1 - Daniel Bünzli <daniel.buenzli at erratique.ch>
    References
    
type encoding = [ `ISO_8859_1 | `US_ASCII | `UTF_16 | `UTF_16BE | `UTF_16LE | `UTF_8 ]
type dtd = string option
type name = string * string
    Expanded names (uri,local). An empty uri represents a name without a
    namespace name, i.e. an unprefixed name 
    that is not under the scope of a default namespace.
type attribute = name * string
type tag = name * attribute list
type signal = [ `Data of string | `Dtd of dtd | `El_end | `El_start of tag ]
    A well-formed sequence of signals belongs to the language of the doc grammar :
    doc ::= `Dtd tree
    tree ::= `El_start child `El_end
    child ::= `Data | tree | epsilon 
    Input and output deal only with well-formed sequences or
    exceptions are raised.
val ns_xml : string
val ns_xmlns : string
type pos = int * int
type error = [ `Expected_char_seqs of string list * string
| `Expected_root_element
| `Illegal_char_ref of string
| `Illegal_char_seq of string
| `Malformed_char_stream
| `Max_buffer_size
| `Unexpected_eoi
| `Unknown_encoding of string
| `Unknown_entity_ref of string
| `Unknown_ns_prefix of string ]
val error_message : error -> string
exception Error of pos * error
type source = [ `Channel of Pervasives.in_channel
| `Fun of unit -> int
| `String of int * string ]
    `String starts reading at the
    given integer position. For `Fun the function must return the
    next byte as an int and raise End_of_file if there is no
    such byte.
type input
val make_input : ?enc:encoding option ->
       ?strip:bool ->
       ?ns:(string -> string option) ->
       ?entity:(string -> string option) -> source -> input
    - enc, character encoding of the document, see details. 
      Defaults to None.
    - strip, strips whitespace in character data, see details.
      Defaults to false.
    - ns is called to bind undeclared namespace prefixes,
      see details. Default returns always None.
    - entity is called to resolve non predefined entity references,
      see details. Default returns always None.
val input : input -> signal
    Inputs a signal. Repeated invocation on the same input abstraction
    yields a well-formed sequence of signals or Xmlm.Error is raised. Furthermore there will be no
    two consecutive `Data signals in the sequence and their string
    is always non empty. After a well-formed sequence was input another may 
    be input, see Xmlm.eoi and details.
    Raises Xmlm.Error on input errors.
val input_tree : el:(tag -> 'a list -> 'a) -> data:(string -> 'a) -> input -> 'a
    If the next signal is a :
    - `Data signal, inputs it and invokes data with the character data.
    - `El_start signal, inputs the sequence of signals until its 
       matching `El_end and invokes el and data as follows
        - el, is called on each `El_end signals with the corresponding 
          `El_start tag and the result of the callback invocation for the 
          element's children.
        - data, is called on each `Data signals with the character data. 
          This function won't be called twice consecutively or with the empty 
          string.
    - Other signals, raises Invalid_argument.
    Raises Xmlm.Error on input errors and Invalid_argument
      if the next signal is not `El_start or `Data.
val input_doc_tree : el:(tag -> 'a list -> 'a) ->
       data:(string -> 'a) -> input -> dtd * 'a
    Same as Xmlm.input_tree but reads a complete well-formed  
    sequence of signals. 
    Raises Xmlm.Error on input errors and Invalid_argument
     if the next signal is not `Dtd.
val peek : input -> signal
    Same as Xmlm.input but doesn't remove the signal from the sequence. 
    Raises Xmlm.Error on input errors.
val eoi : input -> bool
val pos : input -> pos
type 'a frag = [ `Data of string | `El of tag * 'a list ]
    Fragment type used to deconstruct data structures of type 'a.
type dest = [ `Buffer of Buffer.t
| `Channel of Pervasives.out_channel
| `Fun of int -> unit ]
    For `Buffer, the buffer won't
    be cleared. For `Fun the function is called with the output bytes as ints.
type output
val make_output : ?decl:bool ->
       ?nl:bool ->
       ?indent:int option ->
       ?ns_prefix:(string -> string option) -> dest -> output
    - decl, if true the XML
      declaration is output (defaults to true).
    - nl, if true a newline is output when the root's element `El_end 
      signal is output.
      Defaults to false.
    - indent, indentation behaviour, see details. Defaults to
      None.
    - ns_prefix, undeclared namespace prefix bindings, 
      see details. Default returns always None.
val output : output -> signal -> unit
    Raises Invalid_argument if the resulting signal sequence on
    the output abstraction is not well-formed or if a
    namespace name could not be bound to a prefix.
val output_depth : output -> int
    output_depth o is o's current element nesting level (undefined
    before the first `El_start and after the last `El_end).
val output_tree : ('a -> 'a frag) -> output -> 'a -> unit
    Raises see Xmlm.output.
val output_doc_tree : ('a -> 'a frag) -> output -> dtd * 'a -> unit
    Same as Xmlm.output_tree but outputs a complete well-formed 
    sequence of signals.
    Raises see Xmlm.output.
    Xmlm.Make allows clients to specify types for strings and internal
    buffers. Among other things this can be used to perform
    hash-consing or to process the character stream, e.g. to normalize
    unicode characters or to convert to a custom encoding.
type std_string = string
type std_buffer = Buffer.t
module type String = sig..end
module type Buffer = sig..end
module type S = sig..end
Xmlm.Make.
module Make:
    The module assumes strings are immutable, thus strings
    the client gives or receives during the input and output process 
    must not be modified.
    Input
    Encoding
    
The parser supports ASCII, US-ASCII, UTF-8, UTF-16, UTF-16LE, UTF-16BE and ISO-8859-1 (Latin-1) encoded documents. But strings returned by the library are always UTF-8 encoded (unless you use the functor).
    The encoding can be specified explicitly using the optional
    argument enc. Otherwise the parser uses UTF-16 or UTF-8 if there is a
    BOM at the
    beginning of the document. If there is no BOM it uses the encoding
    specified in the  XML
    declaration. Finally, if there is no XML declaration UTF-8 is assumed.
    White space handling
    The parser performs
    attribute data
    normalization on every attribute data.  This means that
    attribute data does not have leading and trailing white space and that 
    any white space is collapsed and transformed to a single space 
    character (U+0020).
    White space handling of character data depends on the strip
    argument. If strip is true, character data is treated like
    attribute data, white space before and after elements is removed
    and any white space is collapsed and transformed to a single
    space character (U+0020), except if the data is under the scope of an xml:space attribute whose value is preserve.  If strip is
    false all white space data is preserved as present in the
    document (however all kinds of
    line ends are
    translated to the newline character (U+000A)).
    Namespaces
    Xmlm's names are
    expanded names.
    The parser automatically handles the document's namespace
    declarations.  Undeclared namespace prefixes can be bound via the
    callback ns, which must return a namespace name. If ns returns
    None an `Unknown_ns_prefix error is raised.
    Attributes used for namespace declarations are preserved by the
    parser. They are in the Xmlm.ns_xmlns namespace. Default namespace
    declarations made with xmlns have the attribute name
    (Xmlm.ns_xmlns, "xmlns"). Prefix declarations have the prefix as
    the local name, for example xmlns:ex results in the attribute name
    (Xmlm.ns_xmlns, "ex").
    Regarding constraints on the usage of the xml and xmlns
    prefixes by documents, the parser does not report errors on violations 
    of the must constraints listed in
    this paragraph. 
    Character and entity references
    Character references
    and predefined
    entities are automatically resolved. Other entity references can
    be resolved by the callback entity, which must return an UTF-8
    (unless you use the functor) string corresponding to the
    replacement character data.  The replacement data is not
    analysed for further references, it is added to the data as such
    modulo white space stripping. If entity returns None the error
    `Unknown_entity_ref is raised.
    Sequences of documents
    When a well-formed sequence of signals is input, no data is consumed beyond
    the closing '>' of the document's root element. 
    If you want to parse a document as
    defined in the XML
    specification, call Xmlm.eoi after a well-formed sequence of
    signals, it must return true. If you expect another document on
    the same input abstraction a new well-formed sequence of signals
    can be Xmlm.input. Use Xmlm.eoi to check if a document follows (this
    may consume data).
    Invoking Xmlm.eoi after a well-formed sequence of signals skips
    whitespaces, comments and processing instructions until it gets to
    either an  XML
    declaration or a DTD
    or the start of a new element or the end of input (in which case
    Xmlm.eoi returns true).  If there is a new document but there is no
    XML declaration or the declaration specifies UTF-16, the same
    encoding as for the previous document is used.
    Miscellaneous
    
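    - Names are parsed according to the XML 1.1 Name definition (minus ':' because
      of namespaces).
    - Attribute and character data chunks are limited by Sys.max_string_length (unless you use the functor). 
      The error `Max_buffer_size is raised if the limit is hit.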
    Output
 
    Encoding
 
    Outputs only  UTF-8
    encoded documents (even if you use the functor).  Strings given to
    output functions must be UTF-8 encoded (unless you use the
    functor, but you need to provide a translation), no checks are
    performed.
    Namespaces
    Xmlm's names are
    expanded names.
    Expanded names are automatically converted to
    qualified
    names by the output abstraction. There is no particular api to specify 
    prefixes and default namespaces, 
    the actual result depends solely on the output
    of attributes belonging to the Xmlm.ns_xmlns namespace. For example to set 
    the default namespace of an element to http://example.org/myns, 
    use the following attribute :
    (* xmlns='http://example.org/myns' *)
let default_ns = (Xmlm.ns_xmlns, "xmlns"), "http://example.org/myns"
    To bind the prefix "ex" to http://example.org/ex, use the 
    following attribute :
    (* xmlns:ex='http://example.org/ex' *)
let ex_ns = (Xmlm.ns_xmlns, "ex"), "http://example.org/ex"
    Note that outputting input signals without
    touching namespace declaration attributes will preserve existing
    prefixes and bindings provided the same namespace name is not
    bound to different prefixes in a given context.
    The callback ns_prefix of an output abstraction can be used to
    give a prefix to a namespace name lacking a prefix binding in the
    current output scope. Given a namespace name the function must return 
    the prefix to use. Note that this
    will not add any namespace declaration attribute to the
    output.  If the function returns None, Xmlm.output will raise
    Invalid_argument.  The default function returns always None.
    Indentation
    Output can be indented by specifying the indent argument when an
       output abstraction is created. If indent is None (default)
       signal output does not introduce any extra white space.  If
       indent is Some c, each Xmlm.signal is output on its own line
       (for empty elements `El_start and `El_end are collapsed on a single
       line) and nested elements are indented with c space
       characters.
    Sequences of documents
 
After a well-formed sequence of signals was output, the output abstraction can be reused to output a new well-formed sequence of signals.
    Miscellaneous
    
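    - In attribute and character data you provide, the markup delimiters '<','>','&', and '\"' are 
        automatically escaped to 
        predefined
        entities.
    - No checks are performed on the prefix and local part of output names. For example using the tag name ("","dip d") will produce 
      a non well-formed document because of the space character.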
    Tips
 
    
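    - For an input/output round trip that preserves as much information as possible, input with strip = false and output with indent = None.
    - Complete white space control on output is achieved with indent = None and suitable `Data signals.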
    Examples
 
    Sequential processing
    
Sequential processing has the advantage that you don't need to get the whole document tree in memory to process it.
    The following function reads a single document on an
    input channel and outputs it.
(* [id ic oc] reads a single document on the input channel [ic] and
   writes the same signals to the output channel [oc].

   Raises [Invalid_argument] if the input is not at its end once the
   root element has been closed. *)
let id ic oc =
  let src = Xmlm.make_input (`Channel ic) in
  let dst = Xmlm.make_output (`Channel oc) in
  (* Moves one signal from the input to the output and returns it. *)
  let forward () =
    let signal = Xmlm.input src in
    Xmlm.output dst signal;
    signal
  in
  (* [depth] is the element nesting level before the next signal; the
     loop stops on the `El_end that closes the root element. *)
  let rec pull depth =
    match forward () with
    | `El_start _ -> pull (depth + 1)
    | `El_end -> if depth <> 1 then pull (depth - 1)
    | `Data _ -> pull depth
    | `Dtd _ -> assert false
  in
  ignore (forward ()); (* `Dtd *)
  pull 0;
  if not (Xmlm.eoi src) then invalid_arg "document not well-formed"
    The following function reads a sequence of documents on an
    input channel and outputs it.
let id_seq ic oc = 
  let i = Xmlm.make_input (`Channel ic) in 
  let o = Xmlm.make_output ~nl:true (`Channel oc) in 
  while not (Xmlm.eoi i) do Xmlm.output o (Xmlm.input i) done
    The following function reads a sequence of documents on the 
    input channel. In each document's tree it prunes non root elements
    whose name belongs to prune_list.
(* [prune_docs prune_list ic oc] reads a sequence of documents on the
   input channel [ic] and writes them to the output channel [oc]. In
   each document, non root elements whose name is in [prune_list] are
   dropped along with their content. *)
let prune_docs prune_list ic oc =
  let src = Xmlm.make_input (`Channel ic) in
  let dst = Xmlm.make_output ~nl:true (`Channel oc) in
  (* Moves one signal from the input to the output. *)
  let forward () = Xmlm.output dst (Xmlm.input src) in
  let is_pruned (name, _) = List.mem name prune_list in
  (* Consumes signals without outputting them, up to and including the
     `El_end that closes the element being dropped. Called with level 0
     when the next signal is that element's `El_start. *)
  let rec drop level =
    match Xmlm.input src with
    | `El_start _ -> drop (level + 1)
    | `El_end -> if level <> 1 then drop (level - 1)
    | `Data _ | `Dtd _ -> drop level
  in
  (* Processes the content of the root element; [level] is the nesting
     level below the root. Stops after copying the root's `El_end. *)
  let rec walk level =
    match Xmlm.peek src with
    | `El_start tag ->
        if is_pruned tag then begin
          drop 0;
          walk level
        end else begin
          forward ();
          walk (level + 1)
        end
    | `El_end ->
        forward ();
        if level <> 0 then walk (level - 1)
    | `Data _ ->
        forward ();
        walk level
    | `Dtd _ -> assert false
  in
  let rec each_doc () =
    forward (); (* `Dtd *)
    forward (); (* root start *)
    walk 0;
    if not (Xmlm.eoi src) then each_doc ()
  in
  each_doc ()
    Tree processing
 
    A document's sequence of signals can be easily converted
    to an arborescent data structure. Assume your trees are defined by :
    type tree = E of Xmlm.tag * tree list | D of string
    The following functions input/output xml documents from/to abstractions 
    as value of type tree.
(* [in_tree i] reads a complete document from the input abstraction [i]
   and returns its dtd paired with the document as a [tree]: elements
   become [E] nodes and character data becomes [D] leaves. *)
let in_tree i =
  Xmlm.input_doc_tree
    ~el:(fun tag childs -> E (tag, childs))
    ~data:(fun d -> D d)
    i
(* [out_tree o t] writes [t], a dtd paired with a [tree], as a complete
   document on the output abstraction [o]. *)
let out_tree o t =
  (* Maps one tree node to the fragment expected by Xmlm.output_doc_tree. *)
  let frag node =
    match node with
    | D d -> `Data d
    | E (tag, childs) -> `El (tag, childs)
  in
  Xmlm.output_doc_tree frag o t
    Tabular data processing
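We show how to process XML data that represents tabular data (some people like to do that).
The file we need to deal with represents nominal data about W3C bureaucrats. There are no namespaces and attributes are ignored. The element structure of the document is :
    - <list>, the root element, contains a sequence of <bureaucrat> elements.
A bureaucrat contains the following elements, in order.
    - <name> (mandatory, string).
    - <surname> (mandatory, string).
    - <honest> (optional, empty).
    - <obfuscation_level> (mandatory, float).
    - <tr> (zero or more, string).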
    In OCaml we represent a W3C bureaucrat by this type :
type w3c_bureaucrat = { 
    name : string; 
    surname : string; 
    honest : bool; 
    obfuscation_level : float;
    trs : string list; }
    The following functions input and output W3C bureaucrats as lists
    of values of type w3c_bureaucrat.
(* [in_w3c_bureaucrats src] parses the document read from [src] (with
   white space stripping enabled) and returns the bureaucrats it lists,
   in document order.

   Raises [Invalid_argument "parse error"] when a signal other than the
   expected one is read or when the obfuscation level is not a float,
   and [Invalid_argument "more than one document"] when the input is
   not at its end after the root element. *)
let in_w3c_bureaucrats src =
  let input = Xmlm.make_input ~strip:true src in
  let fail () = invalid_arg "parse error" in
  (* Tag with no namespace and no attributes. *)
  let bare name = ("", name), [] in
  (* Consumes the next signal and requires it to equal [signal]. *)
  let expect signal = if Xmlm.input input <> signal then fail () in
  (* Parses consecutive elements with [parse] until an `El_end is
     peeked; the `El_end itself is left in the input. *)
  let rec elements parse acc =
    match Xmlm.peek input with
    | `El_start _ ->
        let item = parse () in
        elements parse (item :: acc)
    | `El_end -> List.rev acc
    | `Data _ | `Dtd _ -> fail ()
  in
  (* Parses <name>data</name> and returns the data, "" if there is none. *)
  let text_element name () =
    expect (`El_start (bare name));
    let content =
      match Xmlm.peek input with
      | `Data text ->
          ignore (Xmlm.input input);
          text
      | `El_end -> ""
      | `El_start _ | `Dtd _ -> fail ()
    in
    expect `El_end;
    content
  in
  let bureaucrat () =
    try
      expect (`El_start (bare "bureaucrat"));
      let name = text_element "name" () in
      let surname = text_element "surname" () in
      (* <honest> is optional; its presence alone means true. *)
      let honest =
        match Xmlm.peek input with
        | `El_start (("", "honest"), []) ->
            ignore (text_element "honest" ());
            true
        | _ -> false
      in
      let level = float_of_string (text_element "obfuscation_level" ()) in
      let reports = elements (text_element "tr") [] in
      expect `El_end;
      { name = name;
        surname = surname;
        honest = honest;
        obfuscation_level = level;
        trs = reports }
    with Failure _ -> fail () (* float_of_string *)
  in
  expect (`Dtd None);
  expect (`El_start (bare "list"));
  let bureaucrats = elements bureaucrat [] in
  expect `El_end;
  if not (Xmlm.eoi input) then invalid_arg "more than one document";
  bureaucrats
(* [out_w3c_bureaucrats dst bl] writes the bureaucrats [bl] to [dst] as
   a <list> document with no dtd, indented by 2 spaces and followed by
   a newline after the root element. *)
let out_w3c_bureaucrats dst bl =
  let output = Xmlm.make_output ~nl:true ~indent:(Some 2) dst in
  let emit signal = Xmlm.output output signal in
  (* Elements carry no namespace and no attributes. *)
  let open_el name = emit (`El_start (("", name), [])) in
  let close_el () = emit `El_end in
  (* Writes <name>text</name>; no `Data signal is emitted for "". *)
  let text_el name text =
    open_el name;
    if text <> "" then emit (`Data text);
    close_el ()
  in
  let bureaucrat b =
    open_el "bureaucrat";
    text_el "name" b.name;
    text_el "surname" b.surname;
    (* <honest> is written as an empty element, only when true. *)
    if b.honest then text_el "honest" "";
    text_el "obfuscation_level" (string_of_float b.obfuscation_level);
    List.iter (text_el "tr") b.trs;
    close_el ()
  in
  emit (`Dtd None);
  open_el "list";
  List.iter bureaucrat bl;
  close_el ()