View source with raw comments or as raw
   1/*  Part of SWI-Prolog
   2
   3    Author:        Jan Wielemaker
   4    E-mail:        J.Wielemaker@vu.nl
   5    WWW:           http://www.swi-prolog.org
   6    Copyright (c)  2000-2016, University of Amsterdam
   7                              VU University Amsterdam
   8    All rights reserved.
   9
  10    Redistribution and use in source and binary forms, with or without
  11    modification, are permitted provided that the following conditions
  12    are met:
  13
  14    1. Redistributions of source code must retain the above copyright
  15       notice, this list of conditions and the following disclaimer.
  16
  17    2. Redistributions in binary form must reproduce the above copyright
  18       notice, this list of conditions and the following disclaimer in
  19       the documentation and/or other materials provided with the
  20       distribution.
  21
  22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  33    POSSIBILITY OF SUCH DAMAGE.
  34*/
  35
  36:- module(sgml,
  37          [ load_html/3,                % +Input, -DOM, +Options
  38            load_xml/3,                 % +Input, -DOM, +Options
  39            load_sgml/3,                % +Input, -DOM, +Options
  40
  41            load_sgml_file/2,           % +File, -ListOfContent
  42            load_xml_file/2,            % +File, -ListOfContent
  43            load_html_file/2,           % +File, -Document
  44
  45            load_structure/3,           % +File, -Term, +Options
  46
  47            load_dtd/2,                 % +DTD, +File
  48            load_dtd/3,                 % +DTD, +File, +Options
  49            dtd/2,                      % +Type, -DTD
  50            dtd_property/2,             % +DTD, ?Property
  51
  52            new_dtd/2,                  % +Doctype, -DTD
  53            free_dtd/1,                 % +DTD
  54            open_dtd/3,                 % +DTD, +Options, -Stream
  55
  56            new_sgml_parser/2,          % -Parser, +Options
  57            free_sgml_parser/1,         % +Parser
  58            set_sgml_parser/2,          % +Parser, +Options
  59            get_sgml_parser/2,          % +Parser, +Options
  60            sgml_parse/2,               % +Parser, +Options
  61
  62            sgml_register_catalog_file/2, % +File, +StartOrEnd
  63
  64            xml_quote_attribute/3,      % +In, -Quoted, +Encoding
  65            xml_quote_cdata/3,          % +In, -Quoted, +Encoding
  66            xml_quote_attribute/2,      % +In, -Quoted
  67            xml_quote_cdata/2,          % +In, -Quoted
  68            xml_name/1,                 % +In
  69            xml_name/2,                 % +In, +Encoding
  70
  71            xsd_number_string/2,        % ?Number, ?String
  72            xsd_time_string/3,          % ?Term, ?Type, ?String
  73
  74            xml_basechar/1,             % +Code
  75            xml_ideographic/1,          % +Code
  76            xml_combining_char/1,       % +Code
  77            xml_digit/1,                % +Code
  78            xml_extender/1,             % +Code
  79
  80            iri_xml_namespace/2,        % +IRI, -Namespace
  81            iri_xml_namespace/3,        % +IRI, -Namespace, -LocalName
  82            xml_is_dom/1                % +Term
  83          ]).
  84:- use_module(library(lists)).
  85:- use_module(library(option)).
  86:- use_module(library(error)).
  87:- use_module(library(iostream)).
  88
  89:- meta_predicate
  90    load_structure(+, -, :),
  91    load_html(+, -, :),
  92    load_xml(+, -, :),
  93    load_sgml(+, -, :).
  94
  95:- predicate_options(load_structure/3, 3,
  96                     [ charpos(integer),
  97                       defaults(boolean),
  98                       dialect(oneof([html,html4,html5,sgml,xhtml,xhtml5,xml,xmlns])),
  99                       doctype(atom),
 100                       dtd(any),
 101                       encoding(oneof(['iso-8859-1', 'utf-8', 'us-ascii'])),
 102                       entity(atom,atom),
 103                       keep_prefix(boolean),
 104                       file(atom),
 105                       line(integer),
 106                       offset(integer),
 107                       number(oneof([token,integer])),
 108                       qualify_attributes(boolean),
 109                       shorttag(boolean),
 110                       case_sensitive_attributes(boolean),
 111                       case_preserving_attributes(boolean),
 112                       system_entities(boolean),
 113                       max_memory(integer),
 114                       space(oneof([sgml,preserve,default,remove])),
 115                       xmlns(atom),
 116                       xmlns(atom,atom),
 117                       pass_to(sgml_parse/2, 2)
 118                     ]).
 119:- predicate_options(load_html/3, 3,
 120                     [ pass_to(load_structure/3, 3)
 121                     ]).
 122:- predicate_options(load_xml/3, 3,
 123                     [ pass_to(load_structure/3, 3)
 124                     ]).
 125:- predicate_options(load_sgml/3, 3,
 126                     [ pass_to(load_structure/3, 3)
 127                     ]).
 128:- predicate_options(load_dtd/3, 3,
 129                     [ dialect(oneof([sgml,xml,xmlns])),
 130                       pass_to(open/4, 4)
 131                     ]).
 132:- predicate_options(sgml_parse/2, 2,
 133                     [ call(oneof([begin,end,cdata,pi,decl,error,xmlns,urlns]),
 134                            callable),
 135                       content_length(integer),
 136                       document(-any),
 137                       max_errors(integer),
 138                       parse(oneof([file,element,content,declaration,input])),
 139                       source(any),
 140                       syntax_errors(oneof([quiet,print,style])),
 141                       xml_no_ns(oneof([error,quiet]))
 142                     ]).
 143:- predicate_options(new_sgml_parser/2, 2,
 144                     [ dtd(any)
 145                     ]).
 146
 147
 148/** <module> SGML, XML and HTML parser
 149
 150This library allows you to parse SGML, XML   and HTML data into a Prolog
 151data structure. The library defines several families of predicates:
 152
 153  $ High-level predicates :
 154  Most users will only use load_html/3, load_xml/3 or load_sgml/3 to
 155  parse arbitrary input into a _DOM_ structure.  These predicates all
 156  call load_structure/3, which provides more options and may be
 157  used for processing non-standard documents.
 158
 159  The DOM structure can be used by library(xpath) to extract information
 160  from the document.
 161
 162  $ The low-level parser :
 163  The actual parser is written in C and consists of two parts: one for
 164  processing DTD (Document Type Definitions) and one for parsing data.
 165  The data can either be parsed to a Prolog (_DOM_) term or the parser
 166  can perform callbacks for the DOM _events_.
 167
 168  $ Utility predicates :
 169  Finally, this library provides prmitives for classifying characters
 170  and strings according to the XML specification such as xml_name/1 to
 171  verify whether an atom is a valid XML name (identifier).  It also
 172  provides primitives to quote attributes and CDATA elements.
 173*/
 174
 175:- multifile user:file_search_path/2.
 176:- dynamic   user:file_search_path/2.
 177
 178user:file_search_path(dtd, '.').
 179user:file_search_path(dtd, swi('library/DTD')).
 180
 181sgml_register_catalog_file(File, Location) :-
 182    prolog_to_os_filename(File, OsFile),
 183    '_sgml_register_catalog_file'(OsFile, Location).
 184
 185:- use_foreign_library(foreign(sgml2pl)).
 186
 187register_catalog(Base) :-
 188    absolute_file_name(dtd(Base),
 189                           [ extensions([soc]),
 190                             access(read),
 191                             file_errors(fail)
 192                           ],
 193                           SocFile),
 194    sgml_register_catalog_file(SocFile, end).
 195
 196:- initialization
 197    ignore(register_catalog('HTML4')).
 198
 199
 200                 /*******************************
 201                 *         DTD HANDLING         *
 202                 *******************************/
 203
 204/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 205Note that concurrent access to DTD objects  is not allowed, and hence we
 206will allocate and destroy them in each   thread.  Possibibly it would be
 207nicer to find out why  concurrent  access   to  DTD's  is  flawed. It is
 208diagnosed to mess with the entity resolution by Fabien Todescato.
 209- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
 210
 211:- thread_local
 212    current_dtd/2.
 213:- volatile
 214    current_dtd/2.
 215:- thread_local
 216    registered_cleanup/0.
 217:- volatile
 218    registered_cleanup/0.
 219
 220:- multifile
 221    dtd_alias/2.
 222
 223:- create_prolog_flag(html_dialect, html5, [type(atom)]).
 224
 225dtd_alias(html4, 'HTML4').
 226dtd_alias(html5, 'HTML5').
 227dtd_alias(html,  DTD) :-
 228    current_prolog_flag(html_dialect, Dialect),
 229    dtd_alias(Dialect, DTD).
 230
 231%!  dtd(+Type, -DTD) is det.
 232%
 233%   DTD is a DTD object created from  the file dtd(Type). Loaded DTD
 234%   objects are cached. Note that  DTD   objects  may  not be shared
 235%   between threads. Therefore, dtd/2  maintains   the  pool  of DTD
 236%   objects  using  a  thread_local  predicate.    DTD  objects  are
 237%   destroyed if a thread terminates.
 238%
 239%   @error existence_error(source_sink, dtd(Type))
 240
 241dtd(Type, DTD) :-
 242    current_dtd(Type, DTD),
 243    !.
 244dtd(Type, DTD) :-
 245    new_dtd(Type, DTD),
 246    (   dtd_alias(Type, Base)
 247    ->  true
 248    ;   Base = Type
 249    ),
 250    absolute_file_name(dtd(Base),
 251                       [ extensions([dtd]),
 252                         access(read)
 253                       ], DtdFile),
 254    load_dtd(DTD, DtdFile),
 255    register_cleanup,
 256    asserta(current_dtd(Type, DTD)).
 257
 258%!  load_dtd(+DTD, +DtdFile, +Options)
 259%
 260%   Load DtdFile into a DTD.  Defined options are:
 261%
 262%           * dialect(+Dialect)
 263%           Dialect to use (xml, xmlns, sgml)
 264%
 265%           * encoding(+Encoding)
 266%           Encoding of DTD file
 267%
 268%   @param  DTD is a fresh DTD object, normally created using
 269%           new_dtd/1.
 270
 271load_dtd(DTD, DtdFile) :-
 272    load_dtd(DTD, DtdFile, []).
 273load_dtd(DTD, DtdFile, Options) :-
 274    sgml_open_options(sgml:Options, OpenOptions, sgml:DTDOptions),
 275    setup_call_cleanup(
 276        open_dtd(DTD, DTDOptions, DtdOut),
 277        setup_call_cleanup(
 278            open(DtdFile, read, DtdIn, OpenOptions),
 279            copy_stream_data(DtdIn, DtdOut),
 280            close(DtdIn)),
 281        close(DtdOut)).
 282
 283split_dtd_options([], [], []).
 284split_dtd_options([H|T], [H|TD], S) :-
 285    dtd_option(H),
 286    !,
 287    split_dtd_options(T, TD, S).
 288split_dtd_options([H|T], TD, [H|S]) :-
 289    split_dtd_options(T, TD, S).
 290
 291dtd_option(dialect(_)).
 292
 293
 294%!  destroy_dtds
 295%
 296%   Destroy  DTDs  cached  by  this  thread   as  they  will  become
 297%   unreachable anyway.
 298
 299destroy_dtds :-
 300    (   current_dtd(_Type, DTD),
 301        free_dtd(DTD),
 302        fail
 303    ;   true
 304    ).
 305
 306%!  register_cleanup
 307%
 308%   Register cleanup of DTDs created for this thread.
 309
 310register_cleanup :-
 311    registered_cleanup,
 312    !.
 313register_cleanup :-
 314    catch(thread_at_exit(destroy_dtds), _, true),
 315    assert(registered_cleanup).
 316
 317
 318                 /*******************************
 319                 *          EXAMINE DTD         *
 320                 *******************************/
 321
 322prop(doctype(_), _).
 323prop(elements(_), _).
 324prop(entities(_), _).
 325prop(notations(_), _).
 326prop(entity(E, _), DTD) :-
 327    (   nonvar(E)
 328    ->  true
 329    ;   '$dtd_property'(DTD, entities(EL)),
 330        member(E, EL)
 331    ).
 332prop(element(E, _, _), DTD) :-
 333    (   nonvar(E)
 334    ->  true
 335    ;   '$dtd_property'(DTD, elements(EL)),
 336        member(E, EL)
 337    ).
 338prop(attributes(E, _), DTD) :-
 339    (   nonvar(E)
 340    ->  true
 341    ;   '$dtd_property'(DTD, elements(EL)),
 342        member(E, EL)
 343    ).
 344prop(attribute(E, A, _, _), DTD) :-
 345    (   nonvar(E)
 346    ->  true
 347    ;   '$dtd_property'(DTD, elements(EL)),
 348        member(E, EL)
 349    ),
 350    (   nonvar(A)
 351    ->  true
 352    ;   '$dtd_property'(DTD, attributes(E, AL)),
 353        member(A, AL)
 354    ).
 355prop(notation(N, _), DTD) :-
 356    (   nonvar(N)
 357    ->  true
 358    ;   '$dtd_property'(DTD, notations(NL)),
 359        member(N, NL)
 360    ).
 361
 362dtd_property(DTD, Prop) :-
 363    prop(Prop, DTD),
 364    '$dtd_property'(DTD, Prop).
 365
 366
 367                 /*******************************
 368                 *             SGML             *
 369                 *******************************/
 370
 371%!  load_structure(+Source, -ListOfContent, :Options) is det.
 372%
 373%   Parse   Source   and   return   the   resulting   structure   in
 374%   ListOfContent. Source is handed to  open_any/5, which allows for
 375%   processing an extensible set of input sources.
 376%
 377%   A proper XML document contains only   a  single toplevel element
 378%   whose name matches the document type.   Nevertheless,  a list is
 379%   returned for consistency with  the   representation  of  element
 380%   content.
 381%
 382%   The  encoding(+Encoding)  option   is    treated   special   for
 383%   compatibility reasons:
 384%
 385%     - If `Encoding` is one of =iso-8859-1=, =us-ascii= or =utf-8=,
 386%       the stream is opened in binary mode and the option is passed
 387%       to the SGML parser.
 388%     - If `Encoding` is present, but not one of the above, the
 389%       stream is opened in text mode using the given encoding.
 390%     - Otherwise (no `Encoding`), the stream is opened in binary
 391%       mode and doing the correct decoding is left to the parser.
 392
 393load_structure(Spec, DOM, Options) :-
 394    sgml_open_options(Options, OpenOptions, SGMLOptions),
 395    setup_call_cleanup(
 396        open_any(Spec, read, In, Close, OpenOptions),
 397        load_structure_from_stream(In, DOM, SGMLOptions),
 398        close_any(Close)).
 399
 400sgml_open_options(Options, OpenOptions, SGMLOptions) :-
 401    Options = M:Plain,
 402    (   select_option(encoding(Encoding), Plain, NoEnc)
 403    ->  (   sgml_encoding(Encoding)
 404        ->  merge_options(NoEnc, [type(binary)], OpenOptions),
 405            SGMLOptions = Options
 406        ;   OpenOptions = Plain,
 407            SGMLOptions = M:NoEnc
 408        )
 409    ;   merge_options(Plain, [type(binary)], OpenOptions),
 410        SGMLOptions = Options
 411    ).
 412
 413sgml_encoding(Enc) :-
 414    downcase_atom(Enc, Enc1),
 415    sgml_encoding_l(Enc1).
 416
 417sgml_encoding_l('iso-8859-1').
 418sgml_encoding_l('us-ascii').
 419sgml_encoding_l('utf-8').
 420sgml_encoding_l('utf8').
 421sgml_encoding_l('iso_latin_1').
 422sgml_encoding_l('ascii').
 423
 424load_structure_from_stream(In, Term, M:Options) :-
 425    !,
 426    (   select_option(dtd(DTD), Options, Options1)
 427    ->  ExplicitDTD = true
 428    ;   ExplicitDTD = false,
 429        Options1 = Options
 430    ),
 431    move_front(Options1, dialect(_), Options2), % dialect sets defaults
 432    setup_call_cleanup(
 433        new_sgml_parser(Parser,
 434                        [ dtd(DTD)
 435                        ]),
 436        parse(Parser, M:Options2, TermRead, In),
 437        free_sgml_parser(Parser)),
 438    (   ExplicitDTD == true
 439    ->  (   DTD = dtd(_, DocType),
 440            dtd_property(DTD, doctype(DocType))
 441        ->  true
 442        ;   true
 443        )
 444    ;   free_dtd(DTD)
 445    ),
 446    Term = TermRead.
 447
 448move_front(Options0, Opt, Options) :-
 449    selectchk(Opt, Options0, Options1),
 450    !,
 451    Options = [Opt|Options1].
 452move_front(Options, _, Options).
 453
 454
 455parse(Parser, M:Options, Document, In) :-
 456    set_parser_options(Options, Parser, In, Options1),
 457    parser_meta_options(Options1, M, Options2),
 458    set_input_location(Parser, In),
 459    sgml_parse(Parser,
 460               [ document(Document),
 461                 source(In)
 462               | Options2
 463               ]).
 464
 465set_parser_options([], _, _, []).
 466set_parser_options([H|T], Parser, In, Rest) :-
 467    (   set_parser_option(H, Parser, In)
 468    ->  set_parser_options(T, Parser, In, Rest)
 469    ;   Rest = [H|R2],
 470        set_parser_options(T, Parser, In, R2)
 471    ).
 472
 473set_parser_option(Var, _Parser, _In) :-
 474    var(Var),
 475    !,
 476    instantiation_error(Var).
 477set_parser_option(Option, Parser, _) :-
 478    def_entity(Option, Parser),
 479    !.
 480set_parser_option(offset(Offset), _Parser, In) :-
 481    !,
 482    seek(In, Offset, bof, _).
 483set_parser_option(Option, Parser, _In) :-
 484    parser_option(Option),
 485    !,
 486    set_sgml_parser(Parser, Option).
 487set_parser_option(Name=Value, Parser, In) :-
 488    Option =.. [Name,Value],
 489    set_parser_option(Option, Parser, In).
 490
 491
 492parser_option(dialect(_)).
 493parser_option(shorttag(_)).
 494parser_option(case_sensitive_attributes(_)).
 495parser_option(case_preserving_attributes(_)).
 496parser_option(system_entities(_)).
 497parser_option(max_memory(_)).
 498parser_option(file(_)).
 499parser_option(line(_)).
 500parser_option(space(_)).
 501parser_option(number(_)).
 502parser_option(defaults(_)).
 503parser_option(doctype(_)).
 504parser_option(qualify_attributes(_)).
 505parser_option(encoding(_)).
 506parser_option(keep_prefix(_)).
 507
 508
 509def_entity(entity(Name, Value), Parser) :-
 510    get_sgml_parser(Parser, dtd(DTD)),
 511    xml_quote_attribute(Value, QValue),
 512    setup_call_cleanup(open_dtd(DTD, [], Stream),
 513                       format(Stream, '<!ENTITY ~w "~w">~n',
 514                              [Name, QValue]),
 515                       close(Stream)).
 516def_entity(xmlns(URI), Parser) :-
 517    set_sgml_parser(Parser, xmlns(URI)).
 518def_entity(xmlns(NS, URI), Parser) :-
 519    set_sgml_parser(Parser, xmlns(NS, URI)).
 520
 521%!  parser_meta_options(+Options0, +Module, -Options)
 522%
 523%   Qualify meta-calling options to the parser.
 524
 525parser_meta_options([], _, []).
 526parser_meta_options([call(When, Closure)|T0], M, [call(When, M:Closure)|T]) :-
 527    !,
 528    parser_meta_options(T0, M, T).
 529parser_meta_options([H|T0], M, [H|T]) :-
 530    parser_meta_options(T0, M, T).
 531
 532
 533%!  set_input_location(+Parser, +In:stream) is det.
 534%
 535%   Set the input location if this was not set explicitly
 536
 537set_input_location(Parser, _In) :-
 538    get_sgml_parser(Parser, file(_)),
 539    !.
 540set_input_location(Parser, In) :-
 541    stream_property(In, file_name(File)),
 542    !,
 543    set_sgml_parser(Parser, file(File)),
 544    stream_property(In, position(Pos)),
 545    set_sgml_parser(Parser, position(Pos)).
 546set_input_location(_, _).
 547
 548                 /*******************************
 549                 *           UTILITIES          *
 550                 *******************************/
 551
 552%!  load_sgml_file(+File, -DOM) is det.
 553%
 554%   Load SGML from File and unify   the resulting DOM structure with
 555%   DOM.
 556%
 557%   @deprecated     New code should use load_sgml/3.
 558
 559load_sgml_file(File, Term) :-
 560    load_sgml(File, Term, []).
 561
 562%!  load_xml_file(+File, -DOM) is det.
 563%
 564%   Load XML from File and unify   the  resulting DOM structure with
 565%   DOM.
 566%
 567%   @deprecated     New code should use load_xml/3.
 568
 569load_xml_file(File, Term) :-
 570    load_xml(File, Term, []).
 571
 572%!  load_html_file(+File, -DOM) is det.
 573%
 574%   Load HTML from File and unify   the resulting DOM structure with
 575%   DOM.
 576%
 577%   @deprecated     New code should use load_html/3.
 578
 579load_html_file(File, DOM) :-
 580    load_html(File, DOM, []).
 581
 582%!  load_html(+Input, -DOM, +Options) is det.
 583%
 584%   Load HTML text from Input and  unify the resulting DOM structure
 585%   with DOM. Options are passed   to load_structure/3, after adding
 586%   the following default options:
 587%
 588%     - dtd(DTD)
 589%     Pass the DTD for HTML as obtained using dtd(html, DTD).
 590%     - dialect(Dialect)
 591%     Current dialect from the Prolog flag =html_dialect=
 592%     - max_errors(-1)
 593%     - syntax_errors(quiet)
 594%     Most HTML encountered in the wild contains errors. Even in the
 595%     context of errors, the resulting DOM term is often a
 596%     reasonable guess at the intend of the author.
 597%
 598%   You may also want to use  the library(http/http_open) to support
 599%   loading from HTTP and HTTPS URLs. For example:
 600%
 601%   ==
 602%   :- use_module(library(http/http_open)).
 603%   :- use_module(library(sgml)).
 604%
 605%   load_html_url(URL, DOM) :-
 606%       load_html(URL, DOM, []).
 607%   ==
 608
 609load_html(File, Term, M:Options) :-
 610    current_prolog_flag(html_dialect, Dialect),
 611    dtd(Dialect, DTD),
 612    merge_options(Options,
 613                  [ dtd(DTD),
 614                    dialect(Dialect),
 615                    max_errors(-1),
 616                    syntax_errors(quiet)
 617                  ], Options1),
 618    load_structure(File, Term, M:Options1).
 619
 620%!  load_xml(+Input, -DOM, +Options) is det.
 621%
 622%   Load XML text from Input and   unify the resulting DOM structure
 623%   with DOM. Options are passed   to load_structure/3, after adding
 624%   the following default options:
 625%
 626%     - dialect(xml)
 627
 628load_xml(Input, DOM, M:Options) :-
 629    merge_options(Options,
 630                  [ dialect(xml)
 631                  ], Options1),
 632    load_structure(Input, DOM, M:Options1).
 633
 634%!  load_sgml(+Input, -DOM, +Options) is det.
 635%
 636%   Load SGML text from Input and  unify the resulting DOM structure
 637%   with DOM. Options are passed   to load_structure/3, after adding
 638%   the following default options:
 639%
 640%     - dialect(sgml)
 641
 642load_sgml(Input, DOM, M:Options) :-
 643    merge_options(Options,
 644                  [ dialect(sgml)
 645                  ], Options1),
 646    load_structure(Input, DOM, M:Options1).
 647
 648
 649
 650                 /*******************************
 651                 *            ENCODING          *
 652                 *******************************/
 653
 654%!  xml_quote_attribute(+In, -Quoted) is det.
 655%!  xml_quote_cdata(+In, -Quoted) is det.
 656%
 657%   Backward  compatibility  for  versions  that  allow  to  specify
 658%   encoding. All characters that cannot fit the encoding are mapped
 659%   to XML character entities (&#dd;).  Using   ASCII  is the safest
 660%   value.
 661
 662xml_quote_attribute(In, Quoted) :-
 663    xml_quote_attribute(In, Quoted, ascii).
 664
 665xml_quote_cdata(In, Quoted) :-
 666    xml_quote_cdata(In, Quoted, ascii).
 667
 668%!  xml_name(+Atom) is semidet.
 669%
 670%   True if Atom is a valid XML name.
 671
 672xml_name(In) :-
 673    xml_name(In, ascii).
 674
 675
 676                 /*******************************
 677                 *    XML CHARACTER CLASSES     *
 678                 *******************************/
 679
 680%!  xml_basechar(+CodeOrChar) is semidet.
 681%!  xml_ideographic(+CodeOrChar) is semidet.
 682%!  xml_combining_char(+CodeOrChar) is semidet.
 683%!  xml_digit(+CodeOrChar) is semidet.
 684%!  xml_extender(+CodeOrChar) is semidet.
 685%
 686%   XML  character  classification   predicates.    Each   of  these
 687%   predicates accept both a character   (one-character  atom) and a
 688%   code (integer).
 689%
 690%   @see http://www.w3.org/TR/2006/REC-xml-20060816
 691
 692
 693                 /*******************************
 694                 *         TYPE CHECKING        *
 695                 *******************************/
 696
 697%!  xml_is_dom(@Term) is semidet.
 698%
 699%   True  if  term  statisfies   the    structure   as  returned  by
 700%   load_structure/3 and friends.
 701
 702xml_is_dom(0) :- !, fail.               % catch variables
 703xml_is_dom(List) :-
 704    is_list(List),
 705    !,
 706    xml_is_content_list(List).
 707xml_is_dom(Term) :-
 708    xml_is_element(Term).
 709
 710xml_is_content_list([]).
 711xml_is_content_list([H|T]) :-
 712    xml_is_content(H),
 713    xml_is_content_list(T).
 714
 715xml_is_content(0) :- !, fail.
 716xml_is_content(pi(Pi)) :-
 717    !,
 718    atom(Pi).
 719xml_is_content(CDATA) :-
 720    atom(CDATA),
 721    !.
 722xml_is_content(CDATA) :-
 723    string(CDATA),
 724    !.
 725xml_is_content(Term) :-
 726    xml_is_element(Term).
 727
 728xml_is_element(element(Name, Attributes, Content)) :-
 729    dom_name(Name),
 730    dom_attributes(Attributes),
 731    xml_is_content_list(Content).
 732
 733dom_name(NS:Local) :-
 734    atom(NS),
 735    atom(Local),
 736    !.
 737dom_name(Local) :-
 738    atom(Local).
 739
 740dom_attributes(0) :- !, fail.
 741dom_attributes([]).
 742dom_attributes([H|T]) :-
 743    dom_attribute(H),
 744    dom_attributes(T).
 745
 746dom_attribute(Name=Value) :-
 747    dom_name(Name),
 748    atomic(Value).
 749
 750
 751                 /*******************************
 752                 *            MESSAGES          *
 753                 *******************************/
 754:- multifile
 755    prolog:message/3.
 756
 757%       Catch messages.  sgml/4 is generated by the SGML2PL binding.
 758
 759prolog:message(sgml(Parser, File, Line, Message)) -->
 760    { get_sgml_parser(Parser, dialect(Dialect))
 761    },
 762    [ 'SGML2PL(~w): ~w:~w: ~w'-[Dialect, File, Line, Message] ].
 763
 764
 765                 /*******************************
 766                 *         XREF SUPPORT         *
 767                 *******************************/
 768
 769:- multifile
 770    prolog:called_by/2.
 771
 772prolog:called_by(sgml_parse(_, Options), Called) :-
 773    findall(Meta, meta_call_term(_, Meta, Options), Called).
 774
 775meta_call_term(T, G+N, Options) :-
 776    T = call(Event, G),
 777    pmember(T, Options),
 778    call_params(Event, Term),
 779    functor(Term, _, N).
 780
 781pmember(X, List) :-                     % member for partial lists
 782    nonvar(List),
 783    List = [H|T],
 784    (   X = H
 785    ;   pmember(X, T)
 786    ).
 787
 788call_params(begin, begin(tag,attributes,parser)).
 789call_params(end,   end(tag,parser)).
 790call_params(cdata, cdata(cdata,parser)).
 791call_params(pi,    pi(cdata,parser)).
 792call_params(decl,  decl(cdata,parser)).
 793call_params(error, error(severity,message,parser)).
 794call_params(xmlns, xmlns(namespace,url,parser)).
 795call_params(urlns, urlns(url,url,parser)).
 796
 797                 /*******************************
 798                 *           SANDBOX            *
 799                 *******************************/
 800
 801:- multifile
 802    sandbox:safe_primitive/1,
 803    sandbox:safe_meta_predicate/1.
 804
 805sandbox:safe_meta_predicate(sgml:load_structure/3).
 806sandbox:safe_primitive(sgml:dtd(Dialect, _)) :-
 807    dtd_alias(Dialect, _).
 808sandbox:safe_primitive(sgml:xml_quote_attribute(_,_,_)).
 809sandbox:safe_primitive(sgml:xml_quote_cdata(_,_,_)).
 810sandbox:safe_primitive(sgml:xml_name(_,_)).
 811sandbox:safe_primitive(sgml:xml_basechar(_)).
 812sandbox:safe_primitive(sgml:xml_ideographic(_)).
 813sandbox:safe_primitive(sgml:xml_combining_char(_)).
 814sandbox:safe_primitive(sgml:xml_digit(_)).
 815sandbox:safe_primitive(sgml:xml_extender(_)).
 816sandbox:safe_primitive(sgml:iri_xml_namespace(_,_,_)).
 817sandbox:safe_primitive(sgml:xsd_number_string(_,_)).
 818sandbox:safe_primitive(sgml:xsd_time_string(_,_,_)).