View source with raw comments or as raw
   1/*  Part of SWI-Prolog
   2
   3    Author:        Jan Wielemaker
   4    E-mail:        J.Wielemaker@vu.nl
   5    WWW:           http://www.swi-prolog.org
   6    Copyright (c)  2006-2015, University of Amsterdam
   7                              VU University Amsterdam
   8    All rights reserved.
   9
  10    Redistribution and use in source and binary forms, with or without
  11    modification, are permitted provided that the following conditions
  12    are met:
  13
  14    1. Redistributions of source code must retain the above copyright
  15       notice, this list of conditions and the following disclaimer.
  16
  17    2. Redistributions in binary form must reproduce the above copyright
  18       notice, this list of conditions and the following disclaimer in
  19       the documentation and/or other materials provided with the
  20       distribution.
  21
  22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  33    POSSIBILITY OF SUCH DAMAGE.
  34*/
  35
  36:- module(rdf_http_plugin, []).
  37:- use_module(library(http/http_open)).
  38:- use_module(library(http/http_header)).
  39:- use_module(library(semweb/rdf_db), []). % we define hooks for this
  40:- use_module(library(date)).
  41:- use_module(library(error)).
  42:- use_module(library(lists)).
  43:- use_module(library(option)).
  44
  45/** <module> RDF HTTP Plugin
  46
  47This module allows loading data into   the semantic web library directly
  48from an HTTP server. The following example  loads the RDF core data into
  49the RDF database.
  50
  51    ==
  52    :- use_module(library(semweb/rdf_db)).
  53    :- use_module(library(semweb/rdf_http_plugin)).
  54
  55        ...,
  56        rdf_load('http://www.w3.org/1999/02/22-rdf-syntax-ns')
  57    ==
  58*/
  59
  60:- multifile
  61    rdf_db:rdf_open_hook/8,
  62    rdf_db:url_protocol/1,
  63    rdf_content_type/3.
  64
  65rdf_db:url_protocol(http).
  66rdf_db:url_protocol(https).
  67
  68
  69% define `rdf_format` as a type.
  70:- multifile error:has_type/2.
  71error:has_type(rdf_format, Term):-
  72    error:has_type(oneof([nquads,ntriples,rdfa,trig,turtle,xml]), Term).
  73
  74%!  rdf_extra_headers(-RequestHeaders:list(compound), +Options:list) is det.
  75%
  76%   Send extra headers with the request. Note that, although we also
  77%   process RDF embedded in HTML, we do  not explicitely ask for it.
  78%   Doing so causes some   (e.g., http://w3.org/2004/02/skos/core to
  79%   reply with the HTML description rather than the RDF).
  80%
  81%   When given, option format(+atom) is used in order to prioritize
  82%   the corresponding RDF content types.
  83
  84rdf_extra_headers([ cert_verify_hook(ssl_verify),
  85                    request_header('Accept'=AcceptValue)
  86                  ], Options) :-
  87    option(format(Format), Options, _VAR),
  88    rdf_accept_header_value(Format, AcceptValue).
  89
  90
  91%!  rdf_db:rdf_open_hook(+Scheme, +URL, +HaveModified,
  92%!                       -Stream, -Cleanup, -Modified, -Format,
  93%!                       +Options) is semidet.
  94%
  95%   Load hook implementation for HTTP(S) URLs.
  96%
  97%   @arg HaveModified is bound to a timestamp (number) if we already
  98%        have a copy and that copy was modified at HaveModified.
  99%   @arg Modified is bound to =unknown=, =not_modified= or a
 100%        timestamp.
 101
 102rdf_db:rdf_open_hook(https, SourceURL, HaveModified, Stream, Cleanup,
 103                     Modified, Format, Options) :-
 104    rdf_db:rdf_open_hook(http, SourceURL, HaveModified, Stream, Cleanup,
 105                         Modified, Format, Options).
 106rdf_db:rdf_open_hook(http, SourceURL, HaveModified, Stream, Cleanup,
 107                     Modified, Format, Options) :-
 108    modified_since_header(HaveModified, Header),
 109    TypeHdr = [ header(content_type, ContentType),
 110                header(last_modified, ModifiedText)
 111              ],
 112    rdf_extra_headers(Extra, Options),
 113    append([Extra, TypeHdr, Header, Options], OpenOptions),
 114    catch(http_open(SourceURL, Stream0,
 115                    [ status_code(Code)
 116                    | OpenOptions
 117                    ]), E, true),
 118    (   Code == 200
 119    ->  (   open_envelope(ContentType, SourceURL,
 120                          Stream0, Stream, Format)
 121        ->  Cleanup = close(Stream),
 122            (   nonvar(ModifiedText),
 123                parse_time(ModifiedText, ModifiedStamp)
 124            ->  Modified = last_modified(ModifiedStamp)
 125            ;   Modified = unknown
 126            )
 127        ;   close(Stream0),
 128            domain_error(content_type, ContentType)
 129        )
 130    ;   Code == 304
 131    ->  Modified = not_modified,
 132        Cleanup = true
 133    ;   var(E)
 134    ->  throw(error(existence_error(url, SourceURL),
 135                    context(_, status(Code,_))))
 136    ;   throw(E)
 137    ).
 138
 139:- public ssl_verify/5.
 140
 141%!  ssl_verify(+SSL, +ProblemCert, +AllCerts, +FirstCert, +Error)
 142%
 143%   Currently we accept  all  certificates.
 144
 145ssl_verify(_SSL,
 146           _ProblemCertificate, _AllCertificates, _FirstCertificate,
 147           _Error).
 148
 149%!  modified_since_header(+LastModified, -ExtraHeaders) is det.
 150%
 151%   Add an =|If-modified-since|= if we have a version with the given
 152%   time-stamp.
 153
 154modified_since_header(HaveModified, []) :-
 155    var(HaveModified),
 156    !.
 157modified_since_header(HaveModified,
 158                      [ request_header('If-modified-since' =
 159                                       Modified)
 160                      ]) :-
 161    http_timestamp(HaveModified, Modified).
 162
 163%!  open_envelope(+ContentType, +SourceURL, +Stream0, -Stream,
 164%!                ?Format) is semidet.
 165%
 166%   Open possible envelope formats.
 167
 168open_envelope('application/x-gzip', SourceURL, Stream0, Stream, Format) :-
 169    rdf_db:rdf_storage_encoding(_, gzip),
 170    !,
 171    (   var(Format)
 172    ->  file_name_extension(BaseURL, _GzExt, SourceURL),
 173        file_name_extension(_, Ext, BaseURL),
 174        rdf_db:rdf_file_type(Ext, Format)
 175    ;   true
 176    ),
 177    rdf_zlib_plugin:zopen(Stream0, Stream, []).
 178open_envelope(_, _, Stream, Stream, Format) :-
 179    nonvar(Format),
 180    !.
 181open_envelope(ContentType, SourceURL, Stream, Stream, Format) :-
 182    major_content_type(ContentType, Major),
 183    (   rdf_content_type(Major, _, Format)
 184    ->  true
 185    ;   Major == 'text/plain'       % server is not properly configured
 186    ->  file_name_extension(_, Ext, SourceURL),
 187        rdf_db:rdf_file_type(Ext, Format)
 188    ).
 189
 190major_content_type(ContentType, Major) :-
 191    sub_atom(ContentType, Pre, _, _, (;)),
 192    !,
 193    sub_atom(ContentType, 0, Pre, _, Major).
 194major_content_type(Major, Major).
 195
 196
 197%% rdf_accept_header_value(?Format:rdf_format, -AcceptValue:atom) is det.
 198
 199rdf_accept_header_value(Format, AcceptValue) :-
 200    findall(AcceptValue, accept_value(Format, AcceptValue), AcceptValues),
 201    atomic_list_concat(['*/*;q=0.001'|AcceptValues], ',', AcceptValue).
 202
 203accept_value(Format, AcceptValue) :-
 204    rdf_content_type(MediaType, QValue0, Format0),
 205    (   Format == Format0
 206    ->  QValue = 1.0
 207    ;   QValue = QValue0
 208    ),
 209    format(atom(AcceptValue), '~a;q=~3f', [MediaType,QValue]).
 210
 211
 212%!  rdf_content_type(?MediaType:atom, ?QualityValue:between(0.0,1.0),
 213%!                   ?Format:rdf_format) is nondet.
 214%
 215%   Quality values are intended to be   used  in accordance with RFC
 216%   2616. Quality values  are  determined   based  on  the following
 217%   criteria:
 218%
 219%       | **Label** | **Criterion**             | **Value** |
 220%       | A         | Supported RDF parser      | 0.43      |
 221%       | B         | RDF-specific content type | 0.33      |
 222%       | C         | Official content type     | 0.23      |
 223%
 224%   For example, `text/turtle` has quality value 0.99 because it is
 225%   an official content type that is RDF-specific and that has a parser
 226%   in Semweb.
 227%
 228%   This intentionally allows the user to add another content type with
 229%   a higher Q-value (i.e., >0.99).
 230%
 231%   Deduce the RDF encoding from the   mime-type.  This predicate is
 232%   defined as multifile such that the user can associate additional
 233%   content types to RDF formats.
 234%
 235%   @bug The turtle parser only parses a subset of n3.
 236%        (The N3 format is treated as if it were Turtle.)
 237%   @see Discussion http://richard.cyganiak.de/blog/2008/03/what-is-your-rdf-browsers-accept-header/
 238%   @see N-Quadruples http://www.w3.org/ns/formats/N-Quads
 239%   @see N-Triples http://www.w3.org/ns/formats/N-Triples
 240%   @see N3 http://www.w3.org/ns/formats/N3
 241%   @see RDFa http://www.w3.org/ns/formats/RDFa
 242%   @see TriG http://www.w3.org/ns/formats/TriG
 243%   @see Turtle http://www.w3.org/ns/formats/Turtle
 244%   @see XML/RDF http://www.w3.org/ns/formats/RDF_XML
 245
 246rdf_content_type('application/n-quads',    0.99, nquads  ). %ABC
 247rdf_content_type('application/n-triples',  0.99, ntriples). %ABC
 248rdf_content_type('application/rdf',        0.76, xml     ). %AB
 249rdf_content_type('application/rdf+turtle', 0.76, turtle  ). %AB
 250rdf_content_type('application/rdf+xml',    0.76, xml     ). %AB
 251rdf_content_type('application/rss+xml',    0.66, xml     ). %AC
 252rdf_content_type('application/trig',       0.99, trig    ). %ABC
 253rdf_content_type('application/turtle',     0.99, turtle  ). %ABC
 254rdf_content_type('application/x-trig',     0.76, trig    ). %AB
 255rdf_content_type('application/x-turtle',   0.76, turtle  ). %AB
 256rdf_content_type('application/xhtml+xml',  0.66, rdfa    ). %AC
 257rdf_content_type('application/xml',        0.66, xml     ). %AC
 258rdf_content_type('text/html',              0.66, rdfa    ). %AC
 259rdf_content_type('text/n3',                0.56, turtle  ). %BC (N3)
 260rdf_content_type('text/rdf',               0.76, xml     ). %AB
 261rdf_content_type('text/rdf+n3',            0.33, turtle  ). %B (N3)
 262rdf_content_type('text/rdf+xml',           0.76, xml     ). %AB
 263rdf_content_type('text/turtle',            0.76, turtle  ). %AB
 264rdf_content_type('text/xml',               0.66, xml     ). %AC
 265rdf_content_type('application/x-gzip',     0.23, gzip    ). %C