View source with raw comments or as raw
   1/*  Part of SWI-Prolog
   2
   3    Author:        Jan Wielemaker
   4    E-mail:        J.Wielemaker@vu.nl
   5    WWW:           http://www.swi-prolog.org
   6    Copyright (c)  2013-2015, VU University Amsterdam
   7    All rights reserved.
   8
   9    Redistribution and use in source and binary forms, with or without
  10    modification, are permitted provided that the following conditions
  11    are met:
  12
  13    1. Redistributions of source code must retain the above copyright
  14       notice, this list of conditions and the following disclaimer.
  15
  16    2. Redistributions in binary form must reproduce the above copyright
  17       notice, this list of conditions and the following disclaimer in
  18       the documentation and/or other materials provided with the
  19       distribution.
  20
  21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  32    POSSIBILITY OF SUCH DAMAGE.
  33*/
  34
  35:- module(rdf_ntriples,
  36          [ rdf_read_ntriples/3,        % +Input, -Triples, +Options
  37            rdf_read_nquads/3,          % +Input, -Quads, +Options
  38            rdf_process_ntriples/3,     % +Input, :CallBack, +Options
  39
  40            read_ntriple/2,             % +Stream, -Triple
  41            read_nquad/2,               % +Stream, -Quad
  42            read_ntuple/2               % +Stream, -TripleOrQuad
  43          ]).
  44:- use_module(library(record)).
  45:- use_module(library(uri)).
  46:- use_module(library(option)).
  47:- use_module(library(http/http_open)).
  48:- use_module(library(semweb/rdf_db)).
  49:- use_foreign_library(foreign(ntriples)).
  50
  51/** <module> Process files in the RDF N-Triples format
  52
  53The library(semweb/rdf_ntriples) provides a  fast   reader  for  the RDF
  54N-Triples and N-Quads format. N-Triples is   a simple format, originally
  55used to support the W3C RDF  test   suites.  The current format has been
  56extended   and   is   a   subset    of     the    Turtle   format   (see
  57library(semweb/turtle)).
  58
  59The API of this library is   almost identical to library(semweb/turtle).
  60This module provides a plugin  into   rdf_load/2,  making this predicate
  61support the format =ntriples= and =nquads=.
  62
  63@see http://www.w3.org/TR/n-triples/
  64@tbd Sync with RDF 1.1. specification.
  65*/
  66
  67:- predicate_options(rdf_read_ntriples/3, 3,
  68                     [ anon_prefix(any), % atom or node(_)
  69                       base_uri(atom),
  70                       error_count(-integer),
  71                       on_error(oneof([warning,error]))
  72                     ]).
  73:- predicate_options(rdf_read_nquads/3, 3,
  74                     [ anon_prefix(any), % atom or node(_)
  75                       base_uri(atom),
  76                       error_count(-integer),
  77                       on_error(oneof([warning,error])),
  78                       graph(atom)
  79                     ]).
  80:- predicate_options(rdf_process_ntriples/3, 3,
  81                     [ graph(atom),
  82                       pass_to(rdf_read_ntriples/3, 3)
  83                     ]).
  84
  85:- meta_predicate
  86    rdf_process_ntriples(+,2,+).
  87
  88
  89%!  read_ntriple(+Stream, -Triple) is det.
  90%
  91%   Read the next triple from Stream as   Triple. Stream must have a
  92%   byte-oriented encoding and must contain pure ASCII text.
  93%
  94%   @param  Triple is a term triple(Subject,Predicate,Object).
  95%           Arguments follow the normal conventions of the RDF
  96%           libraries.  NodeID elements are mapped to node(Id).
  97%           If end-of-file is reached, Triple is unified with
  98%           =end_of_file=.
  99%   @error  syntax_error(Message) on syntax errors
 100
 101%!  read_nquad(+Stream, -Quad) is det.
 102%
 103%   Read the next quad from  Stream  as   Quad.  Stream  must have a
 104%   byte-oriented encoding and must contain pure ASCII text.
 105%
 106%   @param  Quad is a term quad(Subject,Predicate,Object,Graph).
 107%           Arguments follow the normal conventions of the RDF
 108%           libraries.  NodeID elements are mapped to node(Id).
 109%           If end-of-file is reached, Quad is unified with
 110%           =end_of_file=.
 111%   @error  syntax_error(Message) on syntax errors
 112
 113%!  read_ntuple(+Stream, -Tuple) is det.
 114%
 115%   Read the next triple or quad from  Stream as Tuple. Tuple is one
 116%   of the terms below.  See   read_ntriple/2  and  read_nquad/2 for
 117%   details.
 118%
 119%     - triple(Subject,Predicate,Object)
 120%     - quad(Subject,Predicate,Object,Graph).
 121
 122:- record nt_state(anon_prefix,
 123               graph,
 124               on_error:oneof([warning,error])=warning,
 125               format:oneof([ntriples,nquads]),
 126               error_count=0).
 127
 128
 129%!  rdf_read_ntriples(+Input, -Triples, +Options) is det.
 130%!  rdf_read_nquads(+Input, -Quads, +Options) is det.
 131%
 132%   True when Triples/Quads is a list   of triples/quads from Input.
 133%   Options:
 134%
 135%     * anon_prefix(+AtomOrNode)
 136%     Prefix nodeIDs with this atom.  If AtomOrNode is the term
 137%     node(_), bnodes are returned as node(Id).
 138%     * base_uri(+Atom)
 139%     Defines the default anon_prefix as _:<baseuri>_
 140%     * on_error(Action)
 141%     One of =warning= (default) or =error=
 142%     * error_count(-Count)
 143%     If =on_error= is =warning=, unify Count with th number of
 144%     errors.
 145%     * graph(+Graph)
 146%     For rdf_read_nquads/3, this defines the graph associated
 147%     to _triples_ loaded from the input.  For rdf_read_ntriples/3
 148%     this opion is ignored.
 149%
 150%   @arg Triples is a list of rdf(Subject, Predicate, Object)
 151%   @arg Quads is a list of rdf(Subject, Predicate, Object, Graph)
 152
 153rdf_read_ntriples(Input, Triples, Options) :-
 154    rdf_read_ntuples(Input, Triples, [format(ntriples)|Options]).
 155
 156rdf_read_nquads(Input, Triples, Options) :-
 157    rdf_read_ntuples(Input, Triples, [format(nquads)|Options]).
 158
 159
 160rdf_read_ntuples(Input, Triples, Options) :-
 161    setup_call_cleanup(
 162        open_input(Input, Stream, Close),
 163        (   init_state(Input, Options, State0),
 164            read_ntuples(Stream, Triples, State0, State)
 165        ),
 166        Close),
 167    option(error_count(Count), Options, _),
 168    nt_state_error_count(State, Count).
 169
 170%!  rdf_process_ntriples(+Input, :CallBack, +Options)
 171%
 172%   Call-back interface, compatible with the   other triple readers.
 173%   In  addition  to  the  options  from  rdf_read_ntriples/3,  this
 174%   processes the option graph(Graph).
 175%
 176%   @param  CallBack is called as call(CallBack, Triples, Graph),
 177%           where Triples is a list holding a single rdf(S,P,O)
 178%           triple.  Graph is passed from the =graph= option and
 179%           unbound if this option is omitted.
 180
 181rdf_process_ntriples(Input, CallBack, Options) :-
 182    setup_call_cleanup(
 183        open_input(Input, Stream, Close),
 184        (   init_state(Input, Options, State0),
 185            process_ntriple(Stream, CallBack, State0, State)
 186        ),
 187        Close),
 188    option(error_count(Count), Options, _),
 189    nt_state_error_count(State, Count).
 190
 191
 192%!  read_ntuples(+Stream, -Triples, +State0, -State)
 193
 194read_ntuples(Stream, Triples, State0, State) :-
 195    read_ntuple(Stream, Triple0, State0, State1),
 196    (   Triple0 == end_of_file
 197    ->  Triples = [],
 198        State = State1
 199    ;   map_nodes(Triple0, Triple, State1, State2),
 200        Triples = [Triple|More],
 201        read_ntuples(Stream, More, State2, State)
 202    ).
 203
 204%!  process_ntriple(+Stream, :CallBack, +State0, -State)
 205
 206process_ntriple(Stream, CallBack, State0, State) :-
 207    read_ntuple(Stream, Triple0, State0, State1),
 208    (   Triple0 == end_of_file
 209    ->  State = State1
 210    ;   map_nodes(Triple0, Triple, State1, State2),
 211        nt_state_graph(State2, Graph),
 212        call(CallBack, [Triple], Graph),
 213        process_ntriple(Stream, CallBack, State2, State)
 214    ).
 215
 216%!  read_ntuple(+Stream, -Tuple, +State0, -State) is det.
 217%
 218%   True when Tuple is the next triple on Stream. May increment
 219%   the error count on State.
 220
 221read_ntuple(Stream, Triple, State0, State) :-
 222    nt_state_on_error(State0, error),
 223    !,
 224    read_ntuple(Stream, Triple, State0),
 225    State = State0.
 226read_ntuple(Stream, Triple, State0, State) :-
 227    catch(read_ntuple(Stream, Triple, State0), E, true),
 228    (   var(E)
 229    ->  State = State0
 230    ;   print_message(warning, E),
 231        nt_state_error_count(State0, EC0),
 232        EC is EC0+1,
 233        set_error_count_of_nt_state(EC, State0, State1),
 234        read_ntuple(Stream, Triple, State1, State)
 235    ).
 236
 237read_ntuple(Stream, Triple, State0) :-
 238    nt_state_format(State0, Format),
 239    format_read_ntuple(Format, Stream, Triple, State0).
 240
 241format_read_ntuple(ntriples, Stream, Triple, _) :-
 242    !,
 243    read_ntriple(Stream, Triple).
 244format_read_ntuple(nquads, Stream, Quad, State) :-
 245    !,
 246    read_ntuple(Stream, Tuple),
 247    to_quad(Tuple, Quad, State).
 248
 249to_quad(Quad, Quad, _) :-
 250    functor(Quad, quad, 4),
 251    !.
 252to_quad(triple(S,P,O), quad(S,P,O,Graph), State) :-
 253    nt_state_graph(State, Graph).
 254to_quad(end_of_file, end_of_file, _).
 255
 256
 257map_nodes(triple(S0,P0,O0), rdf(S,P,O), State0, State) :-
 258    map_node(S0, S, State0, State1),
 259    map_node(P0, P, State1, State2),
 260    map_node(O0, O, State2, State).
 261map_nodes(quad(S0,P0,O0,G0), rdf(S,P,O,G), State0, State) :-
 262    map_node(S0, S, State0, State1),
 263    map_node(P0, P, State1, State2),
 264    map_node(O0, O, State2, State3),
 265    map_node(G0, G, State3, State).
 266
 267map_node(node(NodeId), BNode, State, State) :-
 268    nt_state_anon_prefix(State, Prefix),
 269    atom(Prefix),
 270    !,
 271    atom_concat(Prefix, NodeId, BNode).
 272map_node(Node, Node, State, State).
 273
 274
 275%!  open_input(+Input, -Stream, -Close) is det.
 276%
 277%   Open input for reading ntriples. The  default encoding is UTF-8.
 278%   If the input has a different encoding,   Input  must be a stream
 279%   with the correct encoding and the stream type must be =text=.
 280
 281open_input(stream(Stream), Stream, Close) :-
 282    !,
 283    (   stream_property(Stream, type(binary))
 284    ->  set_stream(Stream, encoding(utf8)),
 285        Close = set_stream(Stream, type(binary))
 286    ;   stream_property(Stream, encoding(Old)),
 287        (   n3_encoding(Old)
 288        ->  true
 289        ;   domain_error(ntriples_encoding, Old)
 290        ),
 291        Close = true
 292    ).
 293open_input(Stream, Stream, Close) :-
 294    is_stream(Stream),
 295    !,
 296    open_input(stream(Stream), Stream, Close).
 297open_input(atom(Atom), Stream, close(Stream)) :-
 298    !,
 299    atom_to_memory_file(Atom, MF),
 300    open_memory_file(MF, read, Stream, [free_on_close(true)]).
 301open_input(URL, Stream, close(Stream)) :-
 302    (   sub_atom(URL, 0, _, _, 'http://')
 303    ;   sub_atom(URL, 0, _, _, 'https://')
 304    ),
 305    !,
 306    http_open(URL, Stream, []),
 307    set_stream(Stream, encoding(utf8)).
 308open_input(URL, Stream, close(Stream)) :-
 309    uri_file_name(URL, Path),
 310    !,
 311    open(Path, read, Stream, [encoding(utf8)]).
 312open_input(File, Stream, close(Stream)) :-
 313    absolute_file_name(File, Path,
 314                       [ access(read),
 315                         extensions(['', nt, ntriples])
 316                       ]),
 317    open(Path, read, Stream, [encoding(utf8)]).
 318
 319n3_encoding(octet).
 320n3_encoding(ascii).
 321n3_encoding(iso_latin_1).
 322n3_encoding(utf8).
 323n3_encoding(text).
 324
 325%!  init_state(+Input, +Options, -State) is det.
 326
 327init_state(In, Options, State) :-
 328    (   option(base_uri(BaseURI), Options)
 329    ->  true
 330    ;   In = stream(_)
 331    ->  BaseURI = []
 332    ;   is_stream(In)
 333    ->  BaseURI = []
 334    ;   In = atom(_)
 335    ->  BaseURI = []
 336    ;   uri_is_global(In),
 337        \+ is_absolute_file_name(In)        % Avoid C:Path in Windows
 338    ->  uri_normalized(In, BaseURI)
 339    ;   uri_file_name(BaseURI, In)
 340    ),
 341    (   option(anon_prefix(Prefix), Options)
 342    ->  true
 343    ;   BaseURI == []
 344    ->  Prefix = '_:genid'
 345    ;   atom_concat('_:', BaseURI, Prefix)
 346    ),
 347    option(on_error(OnError), Options, warning),
 348    % If the format is not set explicitly we assume N-Triples.
 349    % The format option _must_ be set before make_nt_state/2.
 350    option(format(Format), Options, ntriples),
 351    rdf_db:graph(Options, Graph),
 352    (   var(Graph)
 353    ->  Graph = user
 354    ;   true
 355    ),
 356    make_nt_state([ anon_prefix(Prefix),
 357                    on_error(OnError),
 358                    format(Format),
 359                    graph(Graph)
 360                  ], State).
 361
 362
 363                 /*******************************
 364                 *          RDF-DB HOOK         *
 365                 *******************************/
 366
 367:- multifile
 368    rdf_db:rdf_load_stream/3,
 369    rdf_db:rdf_file_type/2.
 370
 371%!  rdf_db:rdf_load_stream(+Format, +Stream, :Options) is semidet.
 372%
 373%   Plugin rule that supports loading   the  =ntriples= and =nquads=
 374%   formats.
 375
 376rdf_db:rdf_load_stream(ntriples, Stream, _Module:Options) :-
 377    rdf_db:graph(Options, Graph),
 378    rdf_transaction((  rdf_process_ntriples(Stream, assert_tuples, Options),
 379                       rdf_set_graph(Graph, modified(false))
 380                    ),
 381                    parse(Graph)).
 382rdf_db:rdf_load_stream(nquads, Stream, _Module:Options) :-
 383    rdf_db:graph(Options, Graph),
 384    (   var(Graph)
 385    ->  Graph = user
 386    ;   true
 387    ),
 388    rdf_transaction((  rdf_process_ntriples(Stream, assert_tuples, Options),
 389                       rdf_set_graph(Graph, modified(false))
 390                    ),
 391                    parse(Graph)).
 392
 393assert_tuples([], _).
 394assert_tuples([H|T], Graph) :-
 395    assert_tuple(H, Graph),
 396    assert_tuples(T, Graph).
 397
 398assert_tuple(rdf(S,P,O), Graph) :-
 399    rdf_assert(S,P,O,Graph).
 400assert_tuple(rdf(S,P,O,Graph), _) :-
 401    rdf_assert(S,P,O,Graph).
 402
 403
 404%!  rdf_db:rdf_file_type(+Extension, -Format)
 405%
 406%   Bind the ntriples reader to  files   with  the  extensions =nt=,
 407%   =ntriples= and =nquads=.
 408
 409rdf_db:rdf_file_type(nt,       ntriples).
 410rdf_db:rdf_file_type(ntriples, ntriples).
 411rdf_db:rdf_file_type(nq,       nquads).
 412rdf_db:rdf_file_type(nquads,   nquads).