View source with raw comments or as raw
   1/*  Part of SWI-Prolog
   2
   3    Author:        Jan Wielemaker
   4    E-mail:        J.Wielemaker@vu.nl
   5    WWW:           http://www.swi-prolog.org
   6    Copyright (c)  2010-2015, University of Amsterdam
   7    All rights reserved.
   8
   9    Redistribution and use in source and binary forms, with or without
  10    modification, are permitted provided that the following conditions
  11    are met:
  12
  13    1. Redistributions of source code must retain the above copyright
  14       notice, this list of conditions and the following disclaimer.
  15
  16    2. Redistributions in binary form must reproduce the above copyright
  17       notice, this list of conditions and the following disclaimer in
  18       the documentation and/or other materials provided with the
  19       distribution.
  20
  21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  32    POSSIBILITY OF SUCH DAMAGE.
  33*/
  34
  35:- module(rdf,
  36          [ load_rdf/2,                 % +File, -Triples
  37            load_rdf/3,                 % +File, -Triples, :Options
  38            xml_to_rdf/3,               % +XML, -Triples, +Options
  39            process_rdf/3               % +File, :OnTriples, :Options
  40          ]).
  41
  42:- meta_predicate
  43    load_rdf(+, -, :),
  44    process_rdf(+, :, :).
  45
  46:- use_module(library(sgml)).           % Basic XML loading
  47:- use_module(library(option)).         % option/3
  48:- use_module(library(lists)).
  49:- use_module(rdf_parser).              % Basic parser
  50:- use_module(rdf_triple).              % Generate triples
  51
  52%!  load_rdf(+File, -Triples) is det.
  53%!  load_rdf(+File, -Triples, :Options) is det.
  54%
  55%   Parse an XML file holding an RDF term into a list of RDF triples.
  56%   see rdf_triple.pl for a definition of the output format. Options:
  57%
  58%           * base_uri(+URI)
  59%           URI to use as base
  60%
  61%           * expand_foreach(+Bool)
  62%           Apply each(Container, Pred, Object) on the members of
  63%           Container
  64%
  65%           * namespaces(-Namespaces:list(NS=URL))
  66%           Return list of namespaces declared using xmlns:NS=URL in
  67%           the document.  This can be used to update the namespace
  68%           list with rdf_register_ns/2.
  69%
  70%   @see    Use process_rdf/3 for processing large documents in
  71%           _|call-back|_ style.
  72
  73load_rdf(File, Triples) :-
  74    load_rdf(File, Triples, []).
  75
  76load_rdf(File, Triples, M:Options0) :-
  77    entity_options(Options0, EntOptions, Options1),
  78    meta_options(load_meta_option, M:Options1, Options),
  79    init_ns_collect(Options, NSList),
  80    load_structure(File,
  81                   [ RDFElement
  82                   ],
  83                   [ dialect(xmlns),
  84                     space(sgml),
  85                     call(xmlns, rdf:on_xmlns)
  86                   | EntOptions
  87                   ]),
  88    rdf_start_file(Options, Cleanup),
  89    call_cleanup(xml_to_rdf(RDFElement, Triples0, Options),
  90                 rdf_end_file(Cleanup)),
  91    exit_ns_collect(NSList),
  92    post_process(Options, Triples0, Triples).
  93
  94entity_options([], [], []).
  95entity_options([H|T0], Entities, Rest) :-
  96    (   H = entity(_,_)
  97    ->  Entities = [H|ET],
  98        entity_options(T0, ET, Rest)
  99    ;   Rest = [H|RT],
 100        entity_options(T0, Entities, RT)
 101    ).
 102
 103load_meta_option(convert_typed_literal).
 104
 105%!  xml_to_rdf(+XML, -Triples, +Options)
 106
 107xml_to_rdf(XML, Triples, Options) :-
 108    is_list(Options),
 109    !,
 110    make_rdf_state(Options, State, _),
 111    xml_to_plrdf(XML, RDF, State),
 112    rdf_triples(RDF, Triples).
 113xml_to_rdf(XML, BaseURI, Triples) :-
 114    atom(BaseURI),
 115    !,
 116    xml_to_rdf(XML, Triples, [base_uri(BaseURI)]).
 117
 118
 119                 /*******************************
 120                 *       POST-PROCESSING        *
 121                 *******************************/
 122
 123post_process([], Triples, Triples).
 124post_process([expand_foreach(true)|T], Triples0, Triples) :-
 125    !,
 126    expand_each(Triples0, Triples1),
 127    post_process(T, Triples1, Triples).
 128post_process([_|T], Triples0, Triples) :-
 129    !,
 130    post_process(T, Triples0, Triples).
 131
 132
 133                 /*******************************
 134                 *            EXPAND            *
 135                 *******************************/
 136
 137expand_each(Triples0, Triples) :-
 138    select(rdf(each(Container), Pred, Object),
 139           Triples0, Triples1),
 140    !,
 141    each_triples(Triples1, Container, Pred, Object, Triples2),
 142    expand_each(Triples2, Triples).
 143expand_each(Triples, Triples).
 144
 145each_triples([], _, _, _, []).
 146each_triples([H0|T0], Container, P, O,
 147             [H0, rdf(S,P,O)|T]) :-
 148    H0 = rdf(Container, rdf:A, S),
 149    member_attribute(A),
 150    !,
 151    each_triples(T0, Container, P, O, T).
 152each_triples([H|T0], Container, P, O, [H|T]) :-
 153    each_triples(T0, Container, P, O, T).
 154
 155member_attribute(A) :-
 156    sub_atom(A, 0, _, _, '_').      % must check number?
 157
 158
 159                 /*******************************
 160                 *           BIG FILES          *
 161                 *******************************/
 162
 163%!  process_rdf(+Input, :OnObject, :Options)
 164%
 165%   Process RDF from Input. Input is either an atom or a term of the
 166%   format stream(Handle). For each   encountered  description, call
 167%   OnObject(+Triples) to handle the  triples   resulting  from  the
 168%   description. Defined Options are:
 169%
 170%           * base_uri(+URI)
 171%           Determines the reference URI.
 172%
 173%           * db(DB)
 174%           When loading from a stream, the source is taken from
 175%           this option or -if non-existent- from base_uri.
 176%
 177%           * lang(LanguageID)
 178%           Set initial language (as xml:lang)
 179%
 180%           * convert_typed_literal(:Convertor)
 181%           Call Convertor(+Type, +Content, -RDFObject) to create
 182%           a triple rdf(S, P, RDFObject) instead of rdf(S, P,
 183%           literal(type(Type, Content)).
 184%
 185%           *  namespaces(-Namespaces:list(NS=URL))
 186%           Return list of namespaces declared using xmlns:NS=URL in
 187%           the document.  This can be used to update the namespace
 188%           list with rdf_register_ns/2.
 189%
 190%           * entity(Name, Value)
 191%           Overrule entity values found in the file
 192%
 193%           * embedded(Boolean)
 194%           If =true=, do not give warnings if rdf:RDF is embedded
 195%           in other XML data.
 196
 197process_rdf(File, OnObject, M:Options0) :-
 198    is_list(Options0),
 199    !,
 200    entity_options(Options0, EntOptions, Options1),
 201    meta_options(load_meta_option, M:Options1, Options2),
 202    option(base_uri(BaseURI), Options2, ''),
 203    rdf_start_file(Options2, Cleanup),
 204    strip_module(OnObject, Module, Pred),
 205    b_setval(rdf_object_handler, Module:Pred),
 206    nb_setval(rdf_options, Options2),
 207    nb_setval(rdf_state, -),
 208    init_ns_collect(Options2, NSList),
 209    (   File = stream(In)
 210    ->  Source = BaseURI
 211    ;   is_stream(File)
 212    ->  In = File,
 213        option(graph(Source), Options2, BaseURI)
 214    ;   open(File, read, In, [type(binary)]),
 215        Close = In,
 216        Source = File
 217    ),
 218    new_sgml_parser(Parser, [dtd(DTD)]),
 219    def_entities(EntOptions, DTD),
 220    (   Source \== []
 221    ->  set_sgml_parser(Parser, file(Source))
 222    ;   true
 223    ),
 224    set_sgml_parser(Parser, dialect(xmlns)),
 225    set_sgml_parser(Parser, space(sgml)),
 226    do_process_rdf(Parser, In, NSList, Close, Cleanup, Options2).
 227process_rdf(File, BaseURI, OnObject) :-
 228    process_rdf(File, OnObject, [base_uri(BaseURI)]).
 229
 230def_entities([], _).
 231def_entities([entity(Name, Value)|T], DTD) :-
 232    !,
 233    def_entity(DTD, Name, Value),
 234    def_entities(T, DTD).
 235def_entities([_|T0], DTD) :-
 236    def_entities(T0, DTD).
 237
 238def_entity(DTD, Name, Value) :-
 239    open_dtd(DTD, [], Stream),
 240    xml_quote_attribute(Value, QValue),
 241    format(Stream, '<!ENTITY ~w "~w">~n', [Name, QValue]),
 242    close(Stream).
 243
 244
 245do_process_rdf(Parser, In, NSList, Close, Cleanup, Options) :-
 246    call_cleanup((   sgml_parse(Parser,
 247                                [ source(In),
 248                                  call(begin, on_begin),
 249                                  call(xmlns, on_xmlns)
 250                                | Options
 251                                ]),
 252                     exit_ns_collect(NSList)
 253                 ),
 254                 cleanup_process(Close, Cleanup, Parser)).
 255
 256cleanup_process(In, Cleanup, Parser) :-
 257    (   var(In)
 258    ->  true
 259    ;   close(In)
 260    ),
 261    free_sgml_parser(Parser),
 262    nb_delete(rdf_options),
 263    nb_delete(rdf_object_handler),
 264    nb_delete(rdf_state),
 265    nb_delete(rdf_nslist),
 266    rdf_end_file(Cleanup).
 267
 268on_begin(NS:'RDF', Attr, _) :-
 269    rdf_name_space(NS),
 270    !,
 271    nb_getval(rdf_options, Options),
 272    make_rdf_state(Options, State0, _),
 273    rdf_modify_state(Attr, State0, State),
 274    nb_setval(rdf_state, State).
 275on_begin(Tag, Attr, Parser) :-
 276    nb_getval(rdf_state, State),
 277    (   State == (-)
 278    ->  nb_getval(rdf_options, RdfOptions),
 279        (   memberchk(embedded(true), RdfOptions)
 280        ->  true
 281        ;   print_message(warning, rdf(unexpected(Tag, Parser)))
 282        )
 283    ;   get_sgml_parser(Parser, line(Start)),
 284        get_sgml_parser(Parser, file(File)),
 285        sgml_parse(Parser,
 286                   [ document(Content),
 287                     parse(content)
 288                   ]),
 289        b_getval(rdf_object_handler, OnTriples),
 290        element_to_plrdf(element(Tag, Attr, Content), Objects, State),
 291        rdf_triples(Objects, Triples),
 292        call(OnTriples, Triples, File:Start)
 293    ).
 294
 295%!  on_xmlns(+NS, +URL, +Parser)
 296%
 297%   Build up the list of   encountered xmlns:NS=URL declarations. We
 298%   use  destructive  assignment  here   as    an   alternative   to
 299%   assert/retract, ensuring thread-safety and better performance.
 300
 301on_xmlns(NS, URL, _Parser) :-
 302    (   nb_getval(rdf_nslist, List),
 303        List = list(L0)
 304    ->  nb_linkarg(1, List, [NS=URL|L0])
 305    ;   true
 306    ).
 307
 308init_ns_collect(Options, NSList) :-
 309    (   option(namespaces(NSList), Options, -),
 310        NSList \== (-)
 311    ->  nb_setval(rdf_nslist, list([]))
 312    ;   nb_setval(rdf_nslist, -),
 313        NSList = (-)
 314    ).
 315
 316exit_ns_collect(NSList) :-
 317    (   NSList == (-)
 318    ->  true
 319    ;   nb_getval(rdf_nslist, list(NSList))
 320    ).
 321
 322
 323
 324                 /*******************************
 325                 *            MESSAGES          *
 326                 *******************************/
 327
 328:- multifile
 329    prolog:message/3.
 330
 331%       Catch messages.  sgml/4 is generated by the SGML2PL binding.
 332
 333prolog:message(rdf(unparsed(Data))) -->
 334    { phrase(unparse_xml(Data), XML)
 335    },
 336    [ 'RDF: Failed to interpret "~s"'-[XML] ].
 337prolog:message(rdf(shared_blank_nodes(N))) -->
 338    [ 'RDF: Shared ~D blank nodes'-[N] ].
 339prolog:message(rdf(not_a_name(Name))) -->
 340    [ 'RDF: argument to rdf:ID is not an XML name: ~p'-[Name] ].
 341prolog:message(rdf(redefined_id(Id))) -->
 342    [ 'RDF: rdf:ID ~p: multiple definitions'-[Id] ].
 343prolog:message(rdf(unexpected(Tag, Parser))) -->
 344    { get_sgml_parser(Parser, file(File)),
 345      get_sgml_parser(Parser, line(Line))
 346    },
 347    [ 'RDF: ~w:~d: Unexpected element ~w'-[File, Line, Tag] ].
 348
 349
 350                 /*******************************
 351                 *          XML-TO-TEXT         *
 352                 *******************************/
 353
 354unparse_xml([]) -->
 355    !,
 356    [].
 357unparse_xml([H|T]) -->
 358    !,
 359    unparse_xml(H),
 360    unparse_xml(T).
 361unparse_xml(Atom) -->
 362    { atom(Atom)
 363    },
 364    !,
 365    atom(Atom).
 366unparse_xml(element(Name, Attr, Content)) -->
 367    "<",
 368    identifier(Name),
 369    attributes(Attr),
 370    (   { Content == []
 371        }
 372    ->  "/>"
 373    ;   ">",
 374        unparse_xml(Content)
 375    ).
 376
 377attributes([]) -->
 378    [].
 379attributes([H|T]) -->
 380    attribute(H),
 381    attributes(T).
 382
 383attribute(Name=Value) -->
 384    " ",
 385    identifier(Name),
 386    "=",
 387    value(Value).
 388
 389identifier(NS:Local) -->
 390    !,
 391    "{", atom(NS), "}",
 392    atom(Local).
 393identifier(Local) -->
 394    atom(Local).
 395
 396atom(Atom, Text, Rest) :-
 397    atom_codes(Atom, Chars),
 398    append(Chars, Rest, Text).
 399
 400value(Value) -->
 401    { atom_codes(Value, Chars)
 402    },
 403    "\"",
 404    quoted(Chars),
 405    "\"".
 406
 407quoted([]) -->
 408    [].
 409quoted([H|T]) -->
 410    quote(H),
 411    !,
 412    quoted(T).
 413
 414quote(0'<) --> "&lt;".
 415quote(0'>) --> "&gt;".
 416quote(0'") --> "&quot;".
 417quote(0'&) --> "&amp;".
 418quote(X)   --> [X].
 419
 420
 421                 /*******************************
 422                 *             XREF             *
 423                 *******************************/
 424
 425:- multifile prolog:meta_goal/2.
 426prolog:meta_goal(process_rdf(_,G,_), [G+2]).