View source with raw comments or as raw
   1/*  Part of SWI-Prolog
   2
   3    Author:        Jan Wielemaker
   4    E-mail:        J.Wielemaker@vu.nl
   5    WWW:           http://www.swi-prolog.org
   6    Copyright (c)  2009-2015, VU University Amsterdam
   7    All rights reserved.
   8
   9    Redistribution and use in source and binary forms, with or without
  10    modification, are permitted provided that the following conditions
  11    are met:
  12
  13    1. Redistributions of source code must retain the above copyright
  14       notice, this list of conditions and the following disclaimer.
  15
  16    2. Redistributions in binary form must reproduce the above copyright
  17       notice, this list of conditions and the following disclaimer in
  18       the documentation and/or other materials provided with the
  19       distribution.
  20
  21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  32    POSSIBILITY OF SUCH DAMAGE.
  33*/
  34
% Module header: export list of library(uri). The trailing comment on each
% export gives the argument modes of that predicate.
:- module(uri,
          [ uri_components/2,           % ?URI, ?Components
            uri_data/3,                 % ?Field, +Components, ?Data
            uri_data/4,                 % +Field, +Components, +Data, -New

            uri_normalized/2,           % +URI, -NormalizedURI
            iri_normalized/2,           % +IRI, -NormalizedIRI
            uri_normalized_iri/2,       % +URI, -NormalizedIRI
            uri_normalized/3,           % +URI, +Base, -NormalizedURI
            iri_normalized/3,           % +IRI, +Base, -NormalizedIRI
            uri_normalized_iri/3,       % +URI, +Base, -NormalizedIRI
            uri_resolve/3,              % +URI, +Base, -AbsURI
            uri_is_global/1,            % +URI
            uri_query_components/2,     % ?QueryString, ?NameValueList
            uri_authority_components/2, % ?Authority, ?Components
            uri_authority_data/3,       % ?Field, ?Components, ?Data
                                        % Encoding
            uri_encoded/3,              % +Component, ?Value, ?Encoded
            uri_file_name/2,            % ?URI, ?Path
            uri_iri/2                   % ?URI, ?IRI
          ]).
% Exported predicates that have no Prolog clauses in this file are expected
% to be provided by the foreign (C) library loaded here.
:- use_foreign_library(foreign(uri)).
  57
  58/** <module> Process URIs
  59
This library provides high-performance C-based primitives for
manipulating URIs. We opted for a C-based implementation because of its
much better performance on raw character manipulation. Notably, URI
handling primitives are used in time-critical parts of RDF processing.
This implementation is based on RFC-3986:
  65
  66        http://labs.apache.org/webarch/uri/rfc/rfc3986.html
  67
  68The URI processing in this library is  rather liberal. That is, we break
  69URIs according to the rules, but we  do not validate that the components
  70are valid. Also, percent-decoding for IRIs   is  liberal. It first tries
  71UTF-8; then ISO-Latin-1 and finally accepts %-characters verbatim.
  72
  73Earlier experience has shown that strict   enforcement of the URI syntax
  74results in many errors that  are   accepted  by  many other web-document
  75processing tools.
  76*/
  77
  78%!  uri_components(+URI, -Components) is det.
  79%!  uri_components(-URI, +Components) is det.
  80%
  81%   Break a URI  into  its  5   basic  components  according  to the
  82%   RFC-3986 regular expression:
  83%
  84%       ==
  85%       ^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?
  86%        12            3  4          5       6  7        8 9
  87%       ==
  88%
  89%   @param Components is a   term  uri_components(Scheme, Authority,
  90%   Path, Search, Fragment). If a URI  is *parsed*, i.e., using mode
  91%   (+,-), components that are not   found are left _uninstantiated_
  92%   (variable). See uri_data/3 for accessing this structure.
  93
  94%!  uri_data(?Field, +Components, ?Data) is semidet.
  95%
  96%   Provide access the uri_component structure.  Defined field-names
  97%   are: =scheme=, =authority=, =path=, =search= and =fragment=
  98
  99uri_data(scheme,    uri_components(S, _, _, _, _), S).
 100uri_data(authority, uri_components(_, A, _, _, _), A).
 101uri_data(path,      uri_components(_, _, P, _, _), P).
 102uri_data(search,    uri_components(_, _, _, S, _), S).
 103uri_data(fragment,  uri_components(_, _, _, _, F), F).
 104
 105%!  uri_data(+Field, +Components, +Data, -NewComponents) is semidet.
 106%
 107%   NewComponents is the same as Components with Field set to Data.
 108
 109uri_data(scheme,    uri_components(_, A, P, Q, F), S,
 110                    uri_components(S, A, P, Q, F)).
 111uri_data(authority, uri_components(S, _, P, Q, F), A,
 112                    uri_components(S, A, P, Q, F)).
 113uri_data(path,      uri_components(S, A, _, Q, F), P,
 114                    uri_components(S, A, P, Q, F)).
 115uri_data(search,    uri_components(S, A, P, _, F), Q,
 116                    uri_components(S, A, P, Q, F)).
 117uri_data(fragment,  uri_components(S, A, P, Q, _), F,
 118                    uri_components(S, A, P, Q, F)).
 119
 120%!  uri_normalized(+URI, -NormalizedURI) is det.
 121%
 122%   NormalizedURI is the normalized form   of  URI. Normalization is
 123%   syntactic and involves the following steps:
 124%
 125%       * 6.2.2.1. Case Normalization
 126%       * 6.2.2.2. Percent-Encoding Normalization
 127%       * 6.2.2.3. Path Segment Normalization
 128
 129%!  iri_normalized(+IRI, -NormalizedIRI) is det.
 130%
 131%   NormalizedIRI is the normalized form   of  IRI. Normalization is
 132%   syntactic and involves the following steps:
 133%
 134%       * 6.2.2.1. Case Normalization
 135%       * 6.2.2.3. Path Segment Normalization
 136%
 137%   @see    This is similar to uri_normalized/2, but does not do
 138%           normalization of %-escapes.
 139
 140%!  uri_normalized_iri(+URI, -NormalizedIRI) is det.
 141%
 142%   As uri_normalized/2, but percent-encoding is translated into IRI
 143%   Unicode characters. The translation  is   liberal:  valid  UTF-8
 144%   sequences  of  %-encoded  bytes  are    mapped  to  the  Unicode
 145%   character. Other %XX-sequences are mapped   to the corresponding
 146%   ISO-Latin-1 character and sole % characters are left untouched.
 147%
 148%   @see uri_iri/2.
 149
 150
 151%!  uri_is_global(+URI) is semidet.
 152%
 153%   True if URI has a scheme. The semantics  is the same as the code
 154%   below, but the implementation is more   efficient as it does not
 155%   need to parse the  other  components,   nor  needs  to  bind the
 156%   scheme.
 157%
 158%   ==
 159%   uri_is_global(URI) :-
 160%           uri_components(URI, Components),
 161%           uri_data(scheme, Components, Scheme),
 162%           nonvar(Scheme).
 163%   ==
 164
 165%!  uri_resolve(+URI, +Base, -GlobalURI) is det.
 166%
 167%   Resolve a possibly local URI relative   to Base. This implements
 168%   http://labs.apache.org/webarch/uri/rfc/rfc3986.html#relative-transform
 169
 170%!  uri_normalized(+URI, +Base, -NormalizedGlobalURI) is det.
 171%
 172%   NormalizedGlobalURI is the normalized global version of URI.
 173%   Behaves as if defined by:
 174%
 175%   ==
 176%   uri_normalized(URI, Base, NormalizedGlobalURI) :-
 177%           uri_resolve(URI, Base, GlobalURI),
 178%           uri_normalized(GlobalURI, NormalizedGlobalURI).
 179%   ==
 180
 181%!  iri_normalized(+IRI, +Base, -NormalizedGlobalIRI) is det.
 182%
 183%   NormalizedGlobalIRI is the normalized  global   version  of IRI.
 184%   This is similar to uri_normalized/3, but   does  not do %-escape
 185%   normalization.
 186
 187%!  uri_normalized_iri(+URI, +Base, -NormalizedGlobalIRI) is det.
 188%
 189%   NormalizedGlobalIRI is the normalized global IRI of URI. Behaves
 190%   as if defined by:
 191%
 192%   ==
%   uri_normalized_iri(URI, Base, NormalizedGlobalIRI) :-
 194%           uri_resolve(URI, Base, GlobalURI),
 195%           uri_normalized_iri(GlobalURI, NormalizedGlobalIRI).
 196%   ==
 197
 198%!  uri_query_components(+String, -Query) is det.
 199%!  uri_query_components(-String, +Query) is det.
 200%
%   Perform encoding and decoding of a URI query string. Query is a
 202%   list of fully decoded (Unicode) Name=Value pairs. In mode (-,+),
 203%   query elements of the forms Name(Value)  and Name-Value are also
 204%   accepted to enhance interoperability with   the option and pairs
 205%   libraries.  E.g.
 206%
 207%   ==
 208%   ?- uri_query_components(QS, [a=b, c('d+w'), n-'VU Amsterdam']).
 209%   QS = 'a=b&c=d%2Bw&n=VU%20Amsterdam'.
 210%
 211%   ?- uri_query_components('a=b&c=d%2Bw&n=VU%20Amsterdam', Q).
 212%   Q = [a=b, c='d+w', n='VU Amsterdam'].
 213%   ==
 214
 215
 216%!  uri_authority_components(+Authority, -Components) is det.
 217%!  uri_authority_components(-Authority, +Components) is det.
 218%
 219%   Break-down the authority component of a   URI. The fields of the
 220%   structure Components can be accessed using uri_authority_data/3.
 221
 222%!  uri_authority_data(+Field, ?Components, ?Data) is semidet.
 223%
 224%   Provide access the uri_authority  structure. Defined field-names
 225%   are: =user=, =password=, =host= and =port=
 226
 227uri_authority_data(user,     uri_authority(U, _, _, _), U).
 228uri_authority_data(password, uri_authority(_, P, _, _), P).
 229uri_authority_data(host,     uri_authority(_, _, H, _), H).
 230uri_authority_data(port,     uri_authority(_, _, _, P), P).
 231
 232
 233%!  uri_encoded(+Component, +Value, -Encoded) is det.
 234%!  uri_encoded(+Component, -Value, +Encoded) is det.
 235%
 236%   Encoded  is  the  URI   encoding    for   Value.  When  encoding
 237%   (Value->Encoded), Component specifies the   URI  component where
 238%   the value is used. It  is   one  of =query_value=, =fragment= or
 239%   =path=.  Besides  alphanumerical  characters,    the   following
 240%   characters are passed verbatim (the  set   is  split  in logical
 241%   groups according to RFC3986).
 242%
 243%       $ query_value, fragment :
 244%       "-._~" | "!$'()*,;" | ":@" | "/?"
 245%       $ path :
 246%       "-._~" | "!$&'()*,;=" | ":@" | "/"
 247
 248
 249%!  uri_iri(+URI, -IRI) is det.
 250%!  uri_iri(-URI, +IRI) is det.
 251%
 252%   Convert between a URI, encoded in US-ASCII and an IRI. An IRI is
 253%   a fully expanded  Unicode  string.   Unicode  strings  are first
 254%   encoded into UTF-8, after which %-encoding takes place.
 255%
 256%   @error syntax_error(Culprit) in mode (+,-) if URI is not a
 257%   legally percent-encoded UTF-8 string.
 258
 259
 260%!  uri_file_name(+URI, -FileName) is semidet.
 261%!  uri_file_name(-URI, +FileName) is det.
 262%
 263%   Convert between a URI and a   local  file_name. This protocol is
 264%   covered by RFC 1738. Please note   that file-URIs use _absolute_
 265%   paths. The mode (-, +) translates  a possible relative path into
 266%   an absolute one.
 267
 268uri_file_name(URI, FileName) :-
 269    nonvar(URI),
 270    !,
 271    uri_components(URI, Components),
 272    uri_data(scheme, Components, File), File == file,
 273    (   uri_data(authority, Components, '')
 274    ->  true
 275    ;   uri_data(authority, Components, localhost)
 276    ),
 277    uri_data(path, Components, FileNameEnc),
 278    uri_encoded(path, FileName0, FileNameEnc),
 279    delete_leading_slash(FileName0, FileName).
 280uri_file_name(URI, FileName) :-
 281    nonvar(FileName),
 282    !,
 283    absolute_file_name(FileName, Path0),
 284    ensure_leading_slash(Path0, Path),
 285    uri_encoded(path, Path, PathEnc),
 286    uri_data(scheme, Components, file),
 287    uri_data(authority, Components, ''),
 288    uri_data(path, Components, PathEnc),
 289    uri_components(URI, Components).
 290
 291%!  ensure_leading_slash(+WinPath, -Path).
 292%!  delete_leading_slash(+Path, -WinPath).
 293%
 294%   Deal with the fact that absolute paths   in Windows start with a
 295%   drive letter rather than a  /.  For   URIs  we  need a path that
 296%   starts with a /.
 297
 298ensure_leading_slash(Path, SlashPath) :-
 299    (   sub_atom(Path, 0, _, _, /)
 300    ->  SlashPath = Path
 301    ;   atom_concat(/, Path, SlashPath)
 302    ).
 303
 304:- if(current_prolog_flag(windows, true)).
 305delete_leading_slash(Path, WinPath) :-
 306    atom_concat(/, WinPath, Path),
 307    is_absolute_file_name(WinPath),
 308    !.
 309:- endif.
 310delete_leading_slash(Path, Path).
 311
 312
                 /*******************************
                 *            SANDBOX           *
                 *******************************/

% Add clauses to the multifile hook sandbox:safe_primitive/1 for the
% exported predicates that have no Prolog definition in this file.
% NOTE(review): presumably this hook is consulted by library(sandbox) to
% decide which primitives may be called from sandboxed code -- confirm.

:- multifile sandbox:safe_primitive/1.

sandbox:safe_primitive(uri:uri_components(_,_)).
sandbox:safe_primitive(uri:uri_normalized(_,_)).
sandbox:safe_primitive(uri:iri_normalized(_,_)).
sandbox:safe_primitive(uri:uri_normalized_iri(_,_)).
sandbox:safe_primitive(uri:uri_normalized(_,_,_)).
sandbox:safe_primitive(uri:iri_normalized(_,_,_)).
sandbox:safe_primitive(uri:uri_normalized_iri(_,_,_)).
sandbox:safe_primitive(uri:uri_resolve(_,_,_)).
sandbox:safe_primitive(uri:uri_is_global(_)).
sandbox:safe_primitive(uri:uri_query_components(_,_)).
sandbox:safe_primitive(uri:uri_authority_components(_,_)).
sandbox:safe_primitive(uri:uri_encoded(_,_,_)).
sandbox:safe_primitive(uri:uri_iri(_,_)).
 331sandbox:safe_primitive(uri:uri_iri(_,_)).