1/* Part of SWI-Prolog 2 3 Author: Jan Wielemaker 4 E-mail: J.Wielemaker@vu.nl 5 WWW: http://www.swi-prolog.org 6 Copyright (c) 2006-2015, University of Amsterdam 7 VU University Amsterdam 8 All rights reserved. 9 10 Redistribution and use in source and binary forms, with or without 11 modification, are permitted provided that the following conditions 12 are met: 13 14 1. Redistributions of source code must retain the above copyright 15 notice, this list of conditions and the following disclaimer. 16 17 2. Redistributions in binary form must reproduce the above copyright 18 notice, this list of conditions and the following disclaimer in 19 the documentation and/or other materials provided with the 20 distribution. 21 22 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 23 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 24 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 25 FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 26 COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 27 INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 28 BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 29 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 30 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 31 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 32 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 33 POSSIBILITY OF SUCH DAMAGE. 34*/ 35 36:- module(rdf_http_plugin, []). 37:- use_module(library(http/http_open)). 38:- use_module(library(http/http_header)). 39:- use_module(library(semweb/rdf_db), []). % we define hooks for this 40:- use_module(library(date)). 41:- use_module(library(error)). 42:- use_module(library(lists)). 43:- use_module(library(option)). 44 45/** <module> RDF HTTP Plugin 46 47This module allows loading data into the semantic web library directly 48from an HTTP server. The following example loads the RDF core data into 49the RDF database. 50 51 == 52 :- use_module(library(semweb/rdf_db)). 53 :- use_module(library(semweb/rdf_http_plugin)). 54 55 ..., 56 rdf_load('http://www.w3.org/1999/02/22-rdf-syntax-ns') 57 == 58*/ 59 60:- multifile 61 rdf_db:rdf_open_hook/8, 62 rdf_db:url_protocol/1, 63 rdf_content_type/3. 64 65rdf_dburl_protocol(http). 66rdf_dburl_protocol(https). 67 68 69% define `rdf_format` as a type. 70:- multifile error:has_type/2. 71error:has_type(rdf_format, Term):- 72 error:has_type(oneof([nquads,ntriples,rdfa,trig,turtle,xml]), Term). 73 74%! rdf_extra_headers(-RequestHeaders:list(compound), +Options:list) is det. 75% 76% Send extra headers with the request. Note that, although we also 77% process RDF embedded in HTML, we do not explicitely ask for it. 78% Doing so causes some (e.g., http://w3.org/2004/02/skos/core to 79% reply with the HTML description rather than the RDF). 80% 81% When given, option format(+atom) is used in order to prioritize 82% the corresponding RDF content types. 83 84rdf_extra_headers([ cert_verify_hook(ssl_verify), 85 request_header('Accept'=AcceptValue) 86 ], Options) :- 87 option(format(Format), Options, _VAR), 88 rdf_accept_header_value(Format, AcceptValue). 89 90 91%! rdf_db:rdf_open_hook(+Scheme, +URL, +HaveModified, 92%! -Stream, -Cleanup, -Modified, -Format, 93%! +Options) is semidet. 94% 95% Load hook implementation for HTTP(S) URLs. 96% 97% @arg HaveModified is bound to a timestamp (number) if we already 98% have a copy and that copy was modified at HaveModified. 99% @arg Modified is bound to =unknown=, =not_modified= or a 100% timestamp. 101 102rdf_db:rdf_open_hook(https, SourceURL, HaveModified, Stream, Cleanup, 103 Modified, Format, Options) :- 104 rdf_db:rdf_open_hook(http, SourceURL, HaveModified, Stream, Cleanup, 105 Modified, Format, Options). 106rdf_db:rdf_open_hook(http, SourceURL, HaveModified, Stream, Cleanup, 107 Modified, Format, Options) :- 108 modified_since_header(HaveModified, Header), 109 TypeHdr = [ header(content_type, ContentType), 110 header(last_modified, ModifiedText) 111 ], 112 rdf_extra_headers(Extra, Options), 113 append([Extra, TypeHdr, Header, Options], OpenOptions), 114 catch(http_open(SourceURL, Stream0, 115 [ status_code(Code) 116 | OpenOptions 117 ]), E, true), 118 ( Code == 200 119 -> ( open_envelope(ContentType, SourceURL, 120 Stream0, Stream, Format) 121 -> Cleanup = close(Stream), 122 ( nonvar(ModifiedText), 123 parse_time(ModifiedText, ModifiedStamp) 124 -> Modified = last_modified(ModifiedStamp) 125 ; Modified = unknown 126 ) 127 ; close(Stream0), 128 domain_error(content_type, ContentType) 129 ) 130 ; Code == 304 131 -> Modified = not_modified, 132 Cleanup = true 133 ; var(E) 134 -> throw(error(existence_error(url, SourceURL), 135 context(_, status(Code,_)))) 136 ; throw(E) 137 ). 138 139:- public ssl_verify/5. 140 141%! ssl_verify(+SSL, +ProblemCert, +AllCerts, +FirstCert, +Error) 142% 143% Currently we accept all certificates. 144 145ssl_verify(_SSL, 146 _ProblemCertificate, _AllCertificates, _FirstCertificate, 147 _Error). 148 149%! modified_since_header(+LastModified, -ExtraHeaders) is det. 150% 151% Add an =|If-modified-since|= if we have a version with the given 152% time-stamp. 153 154modified_since_header(HaveModified, []) :- 155 var(HaveModified), 156 !. 157modified_since_header(HaveModified, 158 [ request_header('If-modified-since' = 159 Modified) 160 ]) :- 161 http_timestamp(HaveModified, Modified). 162 163%! open_envelope(+ContentType, +SourceURL, +Stream0, -Stream, 164%! ?Format) is semidet. 165% 166% Open possible envelope formats. 167 168open_envelope('application/x-gzip', SourceURL, Stream0, Stream, Format) :- 169 rdf_db:rdf_storage_encoding(_, gzip), 170 !, 171 ( var(Format) 172 -> file_name_extension(BaseURL, _GzExt, SourceURL), 173 file_name_extension(_, Ext, BaseURL), 174 rdf_db:rdf_file_type(Ext, Format) 175 ; true 176 ), 177 rdf_zlib_plugin:zopen(Stream0, Stream, []). 178open_envelope(_, _, Stream, Stream, Format) :- 179 nonvar(Format), 180 !. 181open_envelope(ContentType, SourceURL, Stream, Stream, Format) :- 182 major_content_type(ContentType, Major), 183 ( rdf_content_type(Major, _, Format) 184 -> true 185 ; Major == 'text/plain' % server is not properly configured 186 -> file_name_extension(_, Ext, SourceURL), 187 rdf_db:rdf_file_type(Ext, Format) 188 ). 189 190major_content_type(ContentType, Major) :- 191 sub_atom(ContentType, Pre, _, _, (;)), 192 !, 193 sub_atom(ContentType, 0, Pre, _, Major). 194major_content_type(Major, Major). 195 196 197%% rdf_accept_header_value(?Format:rdf_format, -AcceptValue:atom) is det. 198 199rdf_accept_header_value(Format, AcceptValue) :- 200 findall(AcceptValue, accept_value(Format, AcceptValue), AcceptValues), 201 atomic_list_concat(['*/*;q=0.001'|AcceptValues], ',', AcceptValue). 202 203accept_value(Format, AcceptValue) :- 204 rdf_content_type(MediaType, QValue0, Format0), 205 ( Format == Format0 206 -> QValue = 1.0 207 ; QValue = QValue0 208 ), 209 format(atom(AcceptValue), '~a;q=~3f', [MediaType,QValue]). 210 211 212%! rdf_content_type(?MediaType:atom, ?QualityValue:between(0.0,1.0), 213%! ?Format:rdf_format) is nondet. 214% 215% Quality values are intended to be used in accordance with RFC 216% 2616. Quality values are determined based on the following 217% criteria: 218% 219% | **Label** | **Criterion** | **Value** | 220% | A | Supported RDF parser | 0.43 | 221% | B | RDF-specific content type | 0.33 | 222% | C | Official content type | 0.23 | 223% 224% For example, `text/turtle` has quality value 0.99 because it is 225% an official content type that is RDF-specific and that has a parser 226% in Semweb. 227% 228% This intentionally allows the user to add another content type with 229% a higher Q-value (i.e., >0.99). 230% 231% Deduce the RDF encoding from the mime-type. This predicate is 232% defined as multifile such that the user can associate additional 233% content types to RDF formats. 234% 235% @bug The turtle parser only parses a subset of n3. 236% (The N3 format is treated as if it were Turtle.) 237% @see Discussion http://richard.cyganiak.de/blog/2008/03/what-is-your-rdf-browsers-accept-header/ 238% @see N-Quadruples http://www.w3.org/ns/formats/N-Quads 239% @see N-Triples http://www.w3.org/ns/formats/N-Triples 240% @see N3 http://www.w3.org/ns/formats/N3 241% @see RDFa http://www.w3.org/ns/formats/RDFa 242% @see TriG http://www.w3.org/ns/formats/TriG 243% @see Turtle http://www.w3.org/ns/formats/Turtle 244% @see XML/RDF http://www.w3.org/ns/formats/RDF_XML 245 246rdf_content_type('application/n-quads', 0.99, nquads ). %ABC 247rdf_content_type('application/n-triples', 0.99, ntriples). %ABC 248rdf_content_type('application/rdf', 0.76, xml ). %AB 249rdf_content_type('application/rdf+turtle', 0.76, turtle ). %AB 250rdf_content_type('application/rdf+xml', 0.76, xml ). %AB 251rdf_content_type('application/rss+xml', 0.66, xml ). %AC 252rdf_content_type('application/trig', 0.99, trig ). %ABC 253rdf_content_type('application/turtle', 0.99, turtle ). %ABC 254rdf_content_type('application/x-trig', 0.76, trig ). %AB 255rdf_content_type('application/x-turtle', 0.76, turtle ). %AB 256rdf_content_type('application/xhtml+xml', 0.66, rdfa ). %AC 257rdf_content_type('application/xml', 0.66, xml ). %AC 258rdf_content_type('text/html', 0.66, rdfa ). %AC 259rdf_content_type('text/n3', 0.56, turtle ). %BC (N3) 260rdf_content_type('text/rdf', 0.76, xml ). %AB 261rdf_content_type('text/rdf+n3', 0.33, turtle ). %B (N3) 262rdf_content_type('text/rdf+xml', 0.76, xml ). %AB 263rdf_content_type('text/turtle', 0.76, turtle ). %AB 264rdf_content_type('text/xml', 0.66, xml ). %AC 265rdf_content_type('application/x-gzip', 0.23, gzip ). %C