
/* xml.q: libxml/libxslt interface
   $Id: xml.q,v 1.8 2006/04/25 18:13:03 agraef Exp $ */

/* This file is part of the Q programming system.

   The Q programming system is free software; you can redistribute it and/or
   modify it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option) any
   later version.

   The Q programming system is distributed in the hope that it will be
   useful, but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */

/* Basic XML/XSLT interface using libxml2 and libxslt. Please send bug
   reports to Dr.Graef@t-online.de. */

/* Manifest constants (parser flags). */

/* Each flag occupies its own bit, so that several flags can be combined into
   a single FLAGS value with a bitwise disjunction. */

public var const
  XML_DTDLOAD, XML_DTDVALID, XML_PEDANTIC, XML_SUBENT, XML_NOBLANKS;

def XML_DTDLOAD  = 0x01; // load DTD
def XML_DTDVALID = 0x02; // validate
def XML_PEDANTIC = 0x04; // pedantic parse
def XML_SUBENT   = 0x08; // substitute entities
def XML_NOBLANKS = 0x10; // suppress blank nodes

/* External objects used to represent parsed documents and their nodes. */

/* XMLDoc: external object representing a parsed document. The is_xml_doc
   predicate is true exactly for values of this type. */

public extern type XMLDoc;
public is_xml_doc X;

is_xml_doc X:XMLDoc		= true;
is_xml_doc _			= false otherwise;

/* XMLNode: external object representing a node of a document. The
   is_xml_node predicate is true exactly for values of this type. */

public extern type XMLNode;
public is_xml_node X;

is_xml_node X:XMLNode		= true;
is_xml_node _			= false otherwise;

/* An XML document is a rooted tree which can be created, traversed and
   manipulated using the operations of this module. There are different types
   of nodes in the tree, each carrying their own type of data. In Q land, the
   node data is described using the XMLNodeInfo type defined below. The
   following document node types are currently recognized:

   - element TAG NS ATTRS: an XML element with given (possibly qualified) name
     (tag), namespace declarations (list of (PREFIX,HREF) pairs) and
     attributes (list of (KEY,VALUE) pairs)

   - element_text TAG NS ATTRS CONTENT: a convenience function which denotes a
     combination of an element node with a text child; this is only used when
     creating a new node, and indicates that a text node child is to be added
     to the node automatically

   - attr KEY VAL: an attribute node; these only occur as results of
     xml_select and xml_attrs, and cannot be inserted directly into a document

   - text CONTENT: text node with given content (a string)

   - cdata CONTENT: like text, but contains unparsed character data

   - comment COMMENT: a comment

   - entity_ref NAME: entity reference (&name;)

   - pi NAME PI: processing instructions (NAME is the application name, PI
     the text of the processing instructions)

   Besides these, there are a number of different node types which are only
   used in the document type definition (DTD). The DTD can be extracted from
   a document using the xml_int_subset and xml_ext_subset functions. These
   nodes are for inspection purposes only; it is not possible to change the
   DTD of a document in-place. (However, you can create a new document, and
   attach a DTD to it, using the xml_new_doc function.) */

/* XMLNodeInfo: the node data as returned by xml_node_info and as passed in
   the INFO argument of xml_new_doc, xml_replace and the xml_add_*
   functions. */

public type XMLNodeInfo = const
  // document nodes
  // (NS is a list of (PREFIX,HREF) pairs, ATTRS a list of (KEY,VALUE) pairs;
  // element_text is only used when creating a new node; attr nodes cannot be
  // inserted into a document directly)
  element TAG NS ATTRS,
  element_text TAG NS ATTRS CONTENT,
  attr NAME VAL,
  text CONTENT, cdata CONTENT, comment COMMENT,
  entity_ref NAME, pi NAME PI,

  // DTD nodes
  // (for inspection only, see xml_int_subset and xml_ext_subset)
  doctype NAME EXTID,

  // element declarations
  // (CONTENT is an XMLElementContent value)
  undefined_element NAME,
  empty_element NAME,
  any_element NAME,
  mixed_element NAME CONTENT,
  std_element NAME CONTENT,

  // attribute declarations
  // (DEFAULT is an XMLAttrDefault value; NOTE(review): VALS is presumably the
  // list of permitted values -- confirm against the C module)
  cdata_attr ELEM_NAME NAME DEFAULT,
  id_attr ELEM_NAME NAME DEFAULT,
  idref_attr ELEM_NAME NAME DEFAULT,
  idrefs_attr ELEM_NAME NAME DEFAULT,
  entity_attr ELEM_NAME NAME DEFAULT,
  entities_attr ELEM_NAME NAME DEFAULT,
  nmtoken_attr ELEM_NAME NAME DEFAULT,
  nmtokens_attr ELEM_NAME NAME DEFAULT,
  enum_attr ELEM_NAME NAME VALS DEFAULT,
  notation_attr ELEM_NAME NAME VALS DEFAULT,

  // entity declarations
  // (NOTE(review): the format of EXTID is not described in this file --
  // confirm against the C module)
  int_entity NAME CONTENT,
  int_param_entity NAME CONTENT,
  ext_entity NAME EXTID CONTENT,
  ext_param_entity NAME EXTID CONTENT;

/* Element content type (CONTENT argument of *_element). The Xs arguments of
   sequence and union are lists of content items. NOTE(review): opt, mult and
   plus presumably stand for the DTD occurrence indicators "?", "*" and "+",
   and pcdata for #PCDATA -- confirm against the C module. */

public type XMLElementContent = const pcdata, sequence Xs, union Xs,
  opt X, mult X, plus X;

/* Attribute defaults (DEFAULT argument of *_attr). NOTE(review): required,
   implied and fixed VAL presumably correspond to the DTD keywords #REQUIRED,
   #IMPLIED and #FIXED, and default VAL to a plain default value -- confirm
   against the C module. */

public type XMLAttrDefault = const required, implied, default VAL, fixed VAL;

/* Internal helper ops. These combine two element content items into a
   sequence or union term, merging the operands into a single flat list
   whenever one or both of them already are terms of the same kind.
   NOTE(review): neither function is referenced elsewhere in this file;
   presumably they are invoked from the external C module -- confirm. */

private mksequence X Y;
private mkunion X Y;

// both operands are sequences: concatenate the member lists
mksequence (sequence As) (sequence Bs)	= sequence (As++Bs);
// only the left operand is a sequence: add the right operand at the end
mksequence (sequence As) B		= sequence (append As B);
// only the right operand is a sequence: add the left operand in front
mksequence A (sequence Bs)		= sequence (cons A Bs);
// neither operand is a sequence: start a new two-element sequence
mksequence A B				= sequence [A,B] otherwise;

// same scheme as above, for unions
mkunion (union As) (union Bs)		= union (As++Bs);
mkunion (union As) B			= union (append As B);
mkunion A (union Bs)			= union (cons A Bs);
mkunion A B				= union [A,B] otherwise;

/* Create a new XML document. Returns the XMLDoc object. The arguments are,
   in the order in which they are passed:

   - VERSION: a string denoting the XML version (or () to indicate the
     default)

   - DTD: the document type, which can be () to denote an empty DTD, a string
     (the URI/filename of the DTD), or a pair (PUBID,SYSID) where PUBID
     denotes the public identifier of the DTD and SYSID its system identifier
     (URI)

   - INFO: the data of the root node (which should denote an element node) */

/* Note that only simple kinds of documents with an internal DTD can be
   created this way. Use the xml_load_file or xml_load_string function below
   to create a skeleton document if a more elaborate prolog is required. */

public extern xml_new_doc VERSION DTD INFO;

/* Load an XML document from a file (xml_load_file) or from a string
   (xml_load_string). FLAGS denotes the parser flags (a bitwise disjunction
   of any of the XML_* constants defined at the beginning of this module, or
   0 for the default). Both functions return the XMLDoc object, and fail if
   there is a fatal error parsing the document. */

public extern xml_load_file NAME FLAGS;
public extern xml_load_string S FLAGS;

/* Save an XML document DOC to a file (xml_save_file) or to a string
   (xml_save_string). When saving to a file, the additional arguments are:

   - ENCODING: the desired encoding (or () for the default)

   - COMPRESSION: the desired level of zlib compression (0 means none, 9 is
     maximum, () indicates the default)

   Note that with xml_save_string, the result is always encoded as UTF-8. */

public extern xml_save_file NAME DOC ENCODING COMPRESSION;
public extern xml_save_string DOC;

/* Retrieve general information about a document. Returns a tuple
   (VERSION,ENCODING,URL,COMPRESSION,STANDALONE), where:

   - VERSION is the XML version of the document

   - ENCODING is the external encoding (if any)

   - URL is the name/location of the document (if any)

   - COMPRESSION is the level of zlib compression

   - STANDALONE is a Bool value which corresponds to the "standalone"
     attribute in the XML document header */

public extern xml_doc_info DOC;

/* Retrieve the internal (xml_int_subset) and external (xml_ext_subset) DTD
   subset of a document. Returns a doctype node (fails if there is no
   corresponding DTD). */

public extern xml_int_subset DOC, xml_ext_subset DOC;

/* Traverse the document tree, i.e., the nodes of the document. The root node
   can be retrieved from an XMLDoc object DOC with xml_root, which returns an
   XMLNode object. Given an XMLNode object NODE, you can then obtain:

   - the document it belongs to with xml_doc

   - its parent node with xml_parent

   - its first and last child node with xml_first and xml_last

   - the next and previous sibling of the node with xml_next and xml_prev

   - the first and last attribute node with xml_first_attr and xml_last_attr

   All these operations fail if the corresponding target node does not exist,
   or if the type of the given node is not supported by this
   implementation. */

public extern xml_root DOC;
public extern xml_doc NODE;
public extern xml_parent NODE;
public extern xml_first NODE, xml_last NODE;
public extern xml_next NODE, xml_prev NODE;
public extern xml_first_attr NODE, xml_last_attr NODE;

/* Convenience functions which return the children and the attribute nodes
   of a node as a list. The list is collected by starting at the first child
   (or attribute) and following xml_next for as long as the result is an
   XMLNode. */

public xml_children NODE;
public xml_attrs NODE;

xml_children NODE:XMLNode	= while is_xml_node xml_next FIRST
				    where FIRST = xml_first NODE;
xml_attrs NODE:XMLNode		= while is_xml_node xml_next FIRST
				    where FIRST = xml_first_attr NODE;

/* Indexing: N!K selects a child of N by indexing the list of its children.
   The rule only applies if this actually yields an XMLNode. */

N:XMLNode!K:Int			= RESULT
				    where RESULT:XMLNode = (xml_children N)!K;

/* Retrieve nodes using an XPath specification. Given an XPath expression
   XPATH (a string), this operation returns the list of all matching nodes in
   the given document DOC. Note that the result may include attribute nodes
   (see the attr constructor of the XMLNodeInfo type). */

public extern xml_select DOC XPATH;

/* Retrieve the data of the node NODE. Returns an XMLNodeInfo value. Fails if
   NODE does not belong to one of the supported node types. */

public extern xml_node_info NODE;

/* Additional node operations:

   - xml_is_blank_node checks whether a node is a blank node (empty or
     whitespace only) and thus possibly ignorable

   - xml_node_base returns the base URI of the given node

   - xml_node_path returns the path of the node (in the format accepted by
     xml_select)

   - xml_node_content returns the text carried by the node, if any (after
     entity substitution)

   For an element node, xml_node_attr can be used to retrieve the value of
   the given attribute (after entity substitution), xml_set_node_attr and
   xml_unset_node_attr to set and unset an attribute value. */

public extern xml_is_blank_node NODE;
public extern xml_node_base NODE;
public extern xml_node_path NODE;
public extern xml_node_content NODE;

public extern xml_node_attr NODE KEY;
public extern xml_set_node_attr NODE KEY VAL;
public extern xml_unset_node_attr NODE KEY;

/* Add a new node to the document tree, given the node data (XMLNodeInfo
   value) INFO. The new node can be added:

   - in place of a given node NODE (xml_replace)

   - as NODE's first or last child (xml_add_first, xml_add_last)

   - as NODE's next or previous sibling (xml_add_next, xml_add_prev)

   Returns the new XMLNode object. */

public extern xml_replace NODE INFO;
public extern xml_add_first NODE INFO, xml_add_last NODE INFO;
public extern xml_add_next NODE INFO, xml_add_prev NODE INFO;

/* Delete the existing node NODE from the document tree. NOTE(review): the
   return value and the status of the NODE object after the call are not
   described here -- confirm against the C module. */

public extern xml_unlink NODE;

/* Basic XSLT support. Stylesheets are represented as objects of the
   XSLTStylesheet type. Note that, in contrast to the XMLDoc type, this is an
   opaque type, i.e., there is no direct means to inspect and manipulate
   parsed stylesheets in memory. However, you can achieve this by reading them
   as XMLDoc objects. This is possible because a stylesheet is just a special
   kind of XML document. The xslt_load_stylesheet function then allows you to
   convert this document to an XSLTStylesheet object.

   Applying a stylesheet to an XML document involves the following steps:

   1. Load and parse the stylesheet using xslt_load_stylesheet. The parameter
      to xslt_load_stylesheet can be either the name of a stylesheet file or a
      corresponding XMLDoc object. The function returns an XSLTStylesheet
      object which is used in the subsequent processing.

   2. Invoke xslt_apply_stylesheet on the stylesheet and the XMLDoc
      object. This returns a new XMLDoc object containing the transformed
      document. The xslt_apply_stylesheet function also accepts additional
      parameters in the form of a singleton or a tuple of (KEY,VALUE) string
      pairs, which allows you to pass additional information to a stylesheet.

   3. Run xslt_save_result_file or xslt_save_result_string on the result and
      the stylesheet to save the transformed document in a file or a
      string. (You could also save the result using one of the XML output
      operations, but then some output-related information contained in the
      stylesheet might be lost.) */

/* XSLTStylesheet: external object representing a parsed stylesheet. The
   is_xslt_stylesheet predicate is true exactly for values of this type. */

public extern type XSLTStylesheet;
public is_xslt_stylesheet X;

is_xslt_stylesheet X:XSLTStylesheet	= true;
is_xslt_stylesheet _			= false otherwise;

/* Load and parse a stylesheet. DOC is either the name of a stylesheet file
   or a corresponding XMLDoc object. Returns an XSLTStylesheet object. */

public extern xslt_load_stylesheet DOC;

/* Apply the stylesheet STYLE to the XMLDoc object DOC. PARAMS is a singleton
   or a tuple of (KEY,VALUE) string pairs which are passed to the stylesheet.
   Returns a new XMLDoc object containing the transformed document. */

public extern xslt_apply_stylesheet STYLE DOC PARAMS;

/* Save the transformed document DOC, as produced with the stylesheet STYLE,
   to a file or to a string. NOTE(review): COMPRESSION presumably has the
   same meaning as with xml_save_file -- confirm against the C module. */

public extern xslt_save_result_file NAME DOC STYLE COMPRESSION;
public extern xslt_save_result_string DOC STYLE;
