url.pl

View source with raw comments or as raw
   1/*  Part of SWI-Prolog
   2
   3    Author:        Jan Wielemaker
   4    E-mail:        J.Wielemaker@vu.nl
   5    WWW:           http://www.swi-prolog.org
   6    Copyright (c)  2000-2015, University of Amsterdam
   7    All rights reserved.
   8
   9    Redistribution and use in source and binary forms, with or without
  10    modification, are permitted provided that the following conditions
  11    are met:
  12
  13    1. Redistributions of source code must retain the above copyright
  14       notice, this list of conditions and the following disclaimer.
  15
  16    2. Redistributions in binary form must reproduce the above copyright
  17       notice, this list of conditions and the following disclaimer in
  18       the documentation and/or other materials provided with the
  19       distribution.
  20
  21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  32    POSSIBILITY OF SUCH DAMAGE.
  33*/
  34
  35:- module(url,
  36          [ parse_url/2,                % +URL, -Parts | -URL +Parts
  37            parse_url/3,                % +URL|URI, +BaseURL, -Parts
  38                                        % -URL, +BaseURL, +Parts
  39            is_absolute_url/1,          % +URL
  40            global_url/3,               % +Local, +Base, -Global
  41            http_location/2,            % ?Parts, ?Location
  42            www_form_encode/2,          % Value <-> Encoded
  43            parse_url_search/2,         % Form-data <-> Form fields
  44
  45            url_iri/2,                  % ?URL, ?IRI
  46
  47            file_name_to_url/2,         % ?FileName, ?URL
  48
  49            set_url_encoding/2          % ?Old, +New
  50          ]).
  51:- use_module(library(lists)).
  52:- use_module(library(error)).
  53:- use_module(library(utf8)).
  54
  55/** <module> Analysing and constructing URL
  56
  57This library deals with the analysis and construction of a URL,
  58Universal Resource Locator. URL is the basis for communicating locations
  59of resources (data) on the web. A URL consists of a protocol identifier
  60(e.g. HTTP, FTP, and a protocol-specific syntax further defining the
  61location. URLs are standardized in RFC-1738.
  62
  63The implementation in this library covers only a small portion of the
  64defined protocols.  Though the initial implementation followed RFC-1738
  65strictly, the current is more relaxed to deal with frequent violations
  66of the standard encountered in practical use.
  67
  68@author Jan Wielemaker
  69@author Lukas Faulstich
  70@deprecated New code should use library(uri), provided by the =clib=
  71            package.
  72*/
  73
  74                 /*******************************
  75                 *            GLOBALISE         *
  76                 *******************************/
  77
  78%!  global_url(+URL, +Base, -Global) is det.
  79%
  80%   Translate a possibly relative URL  into   an  absolute  one.
  81%
  82%   @error syntax_error(illegal_url) if URL is not legal.
  83
  84global_url(URL, BaseURL, Global) :-
  85    (   is_absolute_url(URL),
  86        \+ sub_atom(URL, _, _, _, '%')      % may have escape, use general
  87    ->  Global = URL
  88    ;   sub_atom(URL, 0, _, _, '//')
  89    ->  parse_url(BaseURL, [], Attributes),
  90        memberchk(protocol(Proto), Attributes),
  91        atomic_list_concat([Proto, :, URL], Global)
  92    ;   sub_atom(URL, 0, _, _, #)
  93    ->  (   sub_atom(BaseURL, _, _, 0, #)
  94        ->  sub_atom(URL, 1, _, 0, NoHash),
  95            atom_concat(BaseURL, NoHash, Global)
  96        ;   atom_concat(BaseURL, URL, Global)
  97        )
  98    ;   parse_url(URL, BaseURL, Attributes)
  99    ->  phrase(curl(Attributes), Chars),
 100        atom_codes(Global, Chars)
 101    ;   throw(error(syntax_error(illegal_url), URL))
 102    ).
 103
 104%!  is_absolute_url(+URL)
 105%
 106%   True if URL is an absolute URL. That  is, a URL that starts with
 107%   a protocol identifier.
 108
 109is_absolute_url(URL) :-
 110    sub_atom(URL, 0, _, _, 'http://'),
 111    !.
 112is_absolute_url(URL) :-
 113    sub_atom(URL, 0, _, _, 'https://'),
 114    !.
 115is_absolute_url(URL) :-
 116    sub_atom(URL, 0, _, _, 'ftp://'),
 117    !.
 118is_absolute_url(URL) :-
 119    sub_atom(URL, 0, _, _, 'file://'),
 120    !.
 121is_absolute_url(URL) :-
 122    atom_codes(URL, Codes),
 123    phrase(absolute_url, Codes, _),
 124    !.
 125
 126
 127                 /*******************************
 128                 *        CREATE URL/URI        *
 129                 *******************************/
 130
 131%!  http_location(?Parts, ?Location)
 132%
 133%   Construct or analyze an  HTTP  location.   This  is  similar  to
 134%   parse_url/2, but only deals with the   location  part of an HTTP
 135%   URL. That is, the path, search   and fragment specifiers. In the
 136%   HTTP protocol, the first line of a message is
 137%
 138%       ==
 139%       <Action> <Location> HTTP/<version>
 140%       ==
 141%
 142%   @param Location Atom or list of character codes.
 143
 144http_location(Parts, Location) :-       % Parts --> Location
 145    nonvar(Parts),
 146    !,
 147    phrase(curi(Parts), String),
 148    !,
 149    atom_codes(Location, String).
 150http_location(Parts, Location) :-       % Location --> Parts
 151    atom(Location),
 152    !,
 153    atom_codes(Location, Codes),
 154    phrase(http_location(Parts), Codes).
 155http_location(Parts, Codes) :-          % LocationCodes --> Parts
 156    is_list(Codes),
 157    phrase(http_location(Parts), Codes).
 158
 159
 160curl(A) -->
 161    { memberchk(protocol(Protocol), A)
 162    },
 163    !,
 164    catomic(Protocol),
 165    ":",
 166    curl(Protocol, A).
 167curl(A) -->
 168    curl(http, A).
 169
 170curl(file, A) -->
 171    !,
 172    (   "//"
 173    ->  cpath(A)
 174    ;   cpath(A)
 175    ).
 176curl(_, A) -->
 177    "//",
 178    cuser(A),
 179    chost(A),
 180    cport(A),
 181    cpath(A),
 182    csearch(A),
 183    cfragment(A).
 184
 185curi(A) -->
 186    cpath(A),
 187    csearch(A).
 188
 189cpath(A) -->
 190    (   { memberchk(path(Path), A) }
 191    ->  { atom_codes(Path, Codes) },
 192        www_encode(Codes, [0'/, 0'+, 0':, 0',])
 193    ;   ""
 194    ).
 195
 196cuser(A) -->
 197    (   { memberchk(user(User), A) }
 198    ->  { atom_codes(User, Codes) },
 199        www_encode(Codes, [0':]),
 200        "@"
 201    ;   ""
 202    ).
 203
 204chost(A) -->
 205    (   { memberchk(host(Host), A) }
 206    ->  { atom_codes(Host, Codes) },
 207        www_encode(Codes, [])
 208    ;   ""
 209    ).
 210
 211cport(A) -->
 212    (   { memberchk(port(Port), A), Port \== 80 }
 213    ->  { number_codes(Port, Codes) },
 214        ":",
 215        www_encode(Codes, [])
 216    ;   ""
 217    ).
 218
 219
 220catomic(A, In, Out) :-
 221    atom_codes(A, Codes),
 222    append(Codes, Out, In).
 223
 224%!  csearch(+Attributes)//
 225
 226csearch(A)-->
 227    (   { memberchk(search(Parameters), A) }
 228    ->  csearch(Parameters, [0'?])
 229    ;   []
 230    ).
 231
 232csearch([], _) -->
 233    [].
 234csearch([Parameter|Parameters], Sep) -->
 235    !,
 236    codes(Sep),
 237    cparam(Parameter),
 238    csearch(Parameters, [0'&]).
 239
 240cparam(Name=Value) -->
 241    !,
 242    cname(Name),
 243    "=",
 244    cvalue(Value).
 245cparam(NameValue) -->                   % allow to feed Name(Value)
 246    { compound(NameValue),
 247      !,
 248      NameValue =.. [Name,Value]
 249    },
 250    cname(Name),
 251    "=",
 252    cvalue(Value).
 253cparam(Name)-->
 254    cname(Name).
 255
 256codes([]) --> [].
 257codes([H|T]) --> [H], codes(T).
 258
 259cname(Atom) -->
 260    { atom_codes(Atom, Codes) },
 261    www_encode(Codes, []).
 262
 263%!  cvalue(+Value)// is det.
 264%
 265%   Construct a string from  Value.  Value   is  either  atomic or a
 266%   code-list.
 267
 268cvalue(Value) -->
 269    { atomic(Value),
 270      !,
 271      atom_codes(Value, Codes)
 272    },
 273    www_encode(Codes, []).
 274cvalue(Codes) -->
 275    { must_be(codes, Codes)
 276    },
 277    www_encode(Codes, []).
 278
 279
 280%!  cfragment(+Attributes)//
 281
 282cfragment(A) -->
 283    { memberchk(fragment(Frag), A),
 284      !,
 285      atom_codes(Frag, Codes)
 286    },
 287    "#",
 288    www_encode(Codes, []).
 289cfragment(_) -->
 290    "".
 291
 292
 293                 /*******************************
 294                 *            PARSING           *
 295                 *******************************/
 296
 297%!  parse_url(?URL, ?Attributes) is det.
 298%
 299%   Construct or analyse a URL. URL is an   atom  holding a URL or a
 300%   variable. Attributes is a list of  components. Each component is
 301%   of the format Name(Value). Defined components are:
 302%
 303%       * protocol(Protocol)
 304%       The used protocol. This is, after  the optional =|url:|=, an
 305%       identifier separated from the remainder of  the URL using :.
 306%       parse_url/2 assumes the =http= protocol   if  no protocol is
 307%       specified and the URL can be parsed  as a valid HTTP url. In
 308%       addition to the RFC-1738  specified   protocols,  the =file=
 309%       protocol is supported as well.
 310%
 311%       * host(Host)
 312%       Host-name or IP-address on which   the  resource is located.
 313%       Supported by all network-based protocols.
 314%
 315%       * port(Port)
 316%       Integer port-number to access on   the \arg{Host}. This only
 317%       appears if the port is  explicitly   specified  in  the URL.
 318%       Implicit default ports (e.g., 80 for   HTTP) do _not_ appear
 319%       in the part-list.
 320%
 321%       * path(Path)
 322%       (File-) path addressed by the URL. This is supported for the
 323%       =ftp=, =http= and =file= protocols. If  no path appears, the
 324%       library generates the path =|/|=.
 325%
 326%       * search(ListOfNameValue)
 327%       Search-specification of HTTP URL. This is the part after the
 328%       =|?|=, normally used to transfer data   from HTML forms that
 329%       use the =GET=  protocol.  In  the   URL  it  consists  of  a
 330%       www-form-encoded list of Name=Value pairs. This is mapped to
 331%       a list of Prolog Name=Value  terms   with  decoded names and
 332%       values.
 333%
 334%       * fragment(Fragment)
 335%       Fragment specification of HTTP URL. This   is the part after
 336%       the =|#|= character.
 337%
 338%   The example below illustrates all of this for an HTTP URL.
 339%
 340%       ==
 341%       ?- parse_url('http://www.xyz.org/hello?msg=Hello+World%21#x',
 342%              P).
 343%
 344%       P = [ protocol(http),
 345%             host('www.xyz.org'),
 346%             fragment(x),
 347%             search([ msg = 'Hello World!'
 348%                    ]),
 349%             path('/hello')
 350%           ]
 351%       ==
 352%
 353%   By instantiating the parts-list this predicate   can  be used to
 354%   create a URL.
 355
 356parse_url(URL, Attributes) :-
 357    nonvar(URL),
 358    !,
 359    atom_codes(URL, Codes),
 360    phrase(url(Attributes), Codes).
 361parse_url(URL, Attributes) :-
 362    phrase(curl(Attributes), Codes),
 363    !,
 364    atom_codes(URL, Codes).
 365
 366%!  parse_url(+URL, +BaseURL, -Attributes) is det.
 367%
 368%   Similar to parse_url/2 for relative URLs.  If URL is relative,
 369%   it is resolved using the absolute URL BaseURL.
 370
 371parse_url(URL, BaseURL, Attributes) :-
 372    nonvar(URL),
 373    !,
 374    atom_codes(URL, Codes),
 375    (   phrase(absolute_url, Codes, _)
 376    ->  phrase(url(Attributes), Codes)
 377    ;   (   atomic(BaseURL)
 378        ->  parse_url(BaseURL, BaseA0)
 379        ;   BaseA0 = BaseURL
 380        ),
 381        select(path(BasePath), BaseA0, BaseA1),
 382        delete(BaseA1, search(_), BaseA2),
 383        delete(BaseA2, fragment(_), BaseA3),
 384        phrase(relative_uri(URIA0), Codes),
 385        select(path(LocalPath), URIA0, URIA1),
 386        !,
 387        globalise_path(LocalPath, BasePath, Path),
 388        append(BaseA3, [path(Path)|URIA1], Attributes)
 389    ).
 390parse_url(URL, BaseURL, Attributes) :-
 391    parse_url(BaseURL, BaseAttributes),
 392    memberchk(path(BasePath), BaseAttributes),
 393    (   memberchk(path(LocalPath), Attributes)
 394    ->  globalise_path(LocalPath, BasePath, Path)
 395    ;   Path = BasePath
 396    ),
 397    append([path(Path)|Attributes], BaseAttributes, GlobalAttributes),
 398    phrase(curl(GlobalAttributes), Chars),
 399    atom_codes(URL, Chars).
 400
 401
 402%!  globalise_path(+LocalPath, +RelativeTo, -FullPath) is det.
 403%
 404%   The first clause deals with the  standard URL /... global paths.
 405%   The second with file://drive:path on MS-Windows.   This is a bit
 406%   of a cludge, but unfortunately common practice is -especially on
 407%   Windows- not always following the standard
 408
 409globalise_path(LocalPath, _, LocalPath) :-
 410    sub_atom(LocalPath, 0, _, _, /),
 411    !.
 412globalise_path(LocalPath, _, LocalPath) :-
 413    is_absolute_file_name(LocalPath),
 414    !.
 415globalise_path(Local, Base, Path) :-
 416    base_dir(Base, BaseDir),
 417    make_path(BaseDir, Local, Path).
 418
 419base_dir(BasePath, BaseDir) :-
 420    (   atom_concat(BaseDir, /, BasePath)
 421    ->  true
 422    ;   file_directory_name(BasePath, BaseDir)
 423    ).
 424
 425make_path(Dir, Local, Path) :-
 426    atom_concat('../', L2, Local),
 427    file_directory_name(Dir, Parent),
 428    Parent \== Dir,
 429    !,
 430    make_path(Parent, L2, Path).
 431make_path(/, Local, Path) :-
 432    !,
 433    atom_concat(/, Local, Path).
 434make_path(Dir, Local, Path) :-
 435    atomic_list_concat([Dir, /, Local], Path).
 436
 437
 438%!  absolute_url//
 439%
 440%   True if the input  describes  an   absolute  URL.  This means it
 441%   starts with a URL schema. We demand a   schema  of length > 1 to
 442%   avoid confusion with Windows drive letters.
 443
 444absolute_url -->
 445    lwalpha(_First),
 446    schema_chars(Rest),
 447    { Rest \== [] },
 448    ":",
 449    !.
 450
 451
 452                 /*******************************
 453                 *           SEQUENCES          *
 454                 *******************************/
 455
 456digits(L) -->
 457    digits(L, []).
 458
 459digits([C|T0], T) -->
 460    digit(C),
 461    !,
 462    digits(T0, T).
 463digits(T, T) -->
 464    [].
 465
 466
 467digit(C, [C|T], T) :- code_type(C, digit).
 468
 469                 /*******************************
 470                 *            RFC-3986          *
 471                 *******************************/
 472
 473%!  uri(-Parts)//
 474
 475url([protocol(Schema)|Parts]) -->
 476    schema(Schema),
 477    ":",
 478    !,
 479    hier_part(Schema, Parts, P2),
 480    query(P2, P3),
 481    fragment(P3, []).
 482url([protocol(http)|Parts]) -->         % implicit HTTP
 483    authority(Parts, [path(Path)]),
 484    path_abempty(Path).
 485
 486relative_uri(Parts) -->
 487    relative_part(Parts, P2),
 488    query(P2, P3),
 489    fragment(P3, []).
 490
 491relative_part(Parts, Tail) -->
 492    "//",
 493    !,
 494    authority(Parts, [path(Path)|Tail]),
 495    path_abempty(Path).
 496relative_part([path(Path)|T], T) -->
 497    (   path_absolute(Path)
 498    ;   path_noschema(Path)
 499    ;   path_empty(Path)
 500    ),
 501    !.
 502
 503http_location([path(Path)|P2]) -->
 504    path_abempty(Path),
 505    query(P2, P3),
 506    fragment(P3, []).
 507
 508%!  schema(-Atom)//
 509%
 510%   Schema  is  case-insensitive  and  the    canonical  version  is
 511%   lowercase.
 512%
 513%   ==
 514%   Schema ::= ALPHA *(ALPHA|DIGIT|"+"|"-"|".")
 515%   ==
 516
 517schema(Schema) -->
 518    lwalpha(C0),
 519    schema_chars(Codes),
 520    { atom_codes(Schema, [C0|Codes]) }.
 521
 522schema_chars([H|T]) -->
 523    schema_char(H),
 524    !,
 525    schema_chars(T).
 526schema_chars([]) -->
 527    [].
 528
 529schema_char(H) -->
 530    [C],
 531    { C < 128,
 532      (   code_type(C, alpha)
 533      ->  code_type(H, to_lower(C))
 534      ;   code_type(C, digit)
 535      ->  H = C
 536      ;   schema_extra(C)
 537      ->  H = C
 538      )
 539    }.
 540
 541schema_extra(0'+).
 542schema_extra(0'-).
 543schema_extra(0'.).      % 0'
 544
 545
 546%!  hier_part(+Schema, -Parts, ?Tail)//
 547
 548hier_part(file, [path(Path)|Tail], Tail) -->
 549    !,
 550    "//",
 551    (   win_drive_path(Path)
 552    ;   path_absolute(Path)
 553    ;   path_rootless(Path)
 554    ;   path_empty(Path)
 555    ),
 556    !.
 557hier_part(_, Parts, Tail) -->
 558    "//",
 559    !,
 560    authority(Parts, [path(Path)|Tail]),
 561    path_abempty(Path).
 562hier_part(_, [path(Path)|T], T) -->
 563    (   path_absolute(Path)
 564    ;   path_rootless(Path)
 565    ;   path_empty(Path)
 566    ),
 567    !.
 568
 569authority(Parts, Tail) -->
 570    user_info_chars(UserChars),
 571    "@",
 572    !,
 573    { atom_codes(User, UserChars),
 574      Parts = [user(User),host(Host)|T0]
 575    },
 576    host(Host),
 577    port(T0,Tail).
 578authority([host(Host)|T0], Tail) -->
 579    host(Host),
 580    port(T0, Tail).
 581
 582user_info_chars([H|T]) -->
 583    user_info_char(H),
 584    !,
 585    user_info_chars(T).
 586user_info_chars([]) -->
 587    [].
 588
 589user_info_char(_) --> "@", !, {fail}.
 590user_info_char(C) --> pchar(C).
 591
 592%host(Host) --> ip_literal(Host), !.            % TBD: IP6 addresses
 593host(Host) --> ip4_address(Host), !.
 594host(Host) --> reg_name(Host).
 595
 596ip4_address(Atom) -->
 597    i256_chars(Chars, [0'.|T0]),
 598    i256_chars(T0, [0'.|T1]),
 599    i256_chars(T1, [0'.|T2]),
 600    i256_chars(T2, []),
 601    { atom_codes(Atom, Chars) }.
 602
 603i256_chars(Chars, T) -->
 604    digits(Chars, T),
 605    { \+ \+ ( T = [],
 606              Chars \== [],
 607              number_codes(I, Chars),
 608              I < 256
 609            )
 610    }.
 611
 612reg_name(Host) -->
 613    reg_name_chars(Chars),
 614    { atom_codes(Host, Chars) }.
 615
 616reg_name_chars([H|T]) -->
 617    reg_name_char(H),
 618    !,
 619    reg_name_chars(T).
 620reg_name_chars([]) -->
 621    [].
 622
 623reg_name_char(C) -->
 624    pchar(C),
 625    { C \== 0':,
 626      C \== 0'@
 627    }.
 628
 629port([port(Port)|T], T) -->
 630    ":",
 631    !,
 632    digit(D0),
 633    digits(Ds),
 634    { number_codes(Port, [D0|Ds]) }.
 635port(T, T) -->
 636    [].
 637
 638path_abempty(Path) -->
 639    segments_chars(Chars, []),
 640    {   Chars == []
 641    ->  Path = '/'
 642    ;   atom_codes(Path, Chars)
 643    }.
 644
 645
 646win_drive_path(Path) -->
 647    drive_letter(C0),
 648    ":",
 649    (   "/"
 650    ->  {Codes = [C0, 0':, 0'/|Chars]}
 651    ;   {Codes = [C0, 0':|Chars]}
 652    ),
 653    segment_nz_chars(Chars, T0),
 654    segments_chars(T0, []),
 655    { atom_codes(Path, Codes) }.
 656
 657
 658path_absolute(Path) -->
 659    "/",
 660    segment_nz_chars(Chars, T0),
 661    segments_chars(T0, []),
 662    { atom_codes(Path, [0'/| Chars]) }.
 663
 664path_noschema(Path) -->
 665    segment_nz_nc_chars(Chars, T0),
 666    segments_chars(T0, []),
 667    { atom_codes(Path, Chars) }.
 668
 669path_rootless(Path) -->
 670    segment_nz_chars(Chars, T0),
 671    segments_chars(T0, []),
 672    { atom_codes(Path, Chars) }.
 673
 674path_empty('/') -->
 675    "".
 676
 677segments_chars([0'/|Chars], T) -->      % 0'
 678    "/",
 679    !,
 680    segment_chars(Chars, T0),
 681    segments_chars(T0, T).
 682segments_chars(T, T) -->
 683    [].
 684
 685segment_chars([H|T0], T) -->
 686    pchar(H),
 687    !,
 688    segment_chars(T0, T).
 689segment_chars(T, T) -->
 690    [].
 691
 692segment_nz_chars([H|T0], T) -->
 693    pchar(H),
 694    segment_chars(T0, T).
 695
 696segment_nz_nc_chars([H|T0], T) -->
 697    segment_nz_nc_char(H),
 698    !,
 699    segment_nz_nc_chars(T0, T).
 700segment_nz_nc_chars(T, T) -->
 701    [].
 702
 703segment_nz_nc_char(_) --> ":", !, {fail}.
 704segment_nz_nc_char(C) --> pchar(C).
 705
 706
 707%!  query(-Parts, ?Tail)// is det.
 708%
 709%   Extract &Name=Value, ...
 710
 711query([search(Params)|T], T) -->
 712    "?",
 713    !,
 714    search(Params).
 715query(T,T) -->
 716    [].
 717
 718search([Parameter|Parameters])-->
 719    parameter(Parameter),
 720    !,
 721    (   search_sep
 722    ->  search(Parameters)
 723    ;   { Parameters = [] }
 724    ).
 725search([]) -->
 726    [].
 727
 728parameter(Param)-->
 729    !,
 730    search_chars(NameS),
 731    { atom_codes(Name, NameS)
 732    },
 733    (   "="
 734    ->  search_value_chars(ValueS),
 735        { atom_codes(Value, ValueS),
 736          Param = (Name = Value)
 737        }
 738    ;   { Param = Name
 739        }
 740    ).
 741
 742search_chars([C|T]) -->
 743    search_char(C),
 744    !,
 745    search_chars(T).
 746search_chars([]) -->
 747    [].
 748
 749search_char(_) --> search_sep, !, { fail }.
 750search_char(_) --> "=", !, { fail }.
 751search_char(C) --> fragment_char(C).
 752
 753search_value_chars([C|T]) -->
 754    search_value_char(C),
 755    !,
 756    search_value_chars(T).
 757search_value_chars([]) -->
 758    [].
 759
 760search_value_char(_) --> search_sep, !, { fail }.
 761search_value_char(C) --> fragment_char(C).
 762
 763%!  search_sep// is semidet.
 764%
 765%   Matches a search-parameter separator.  Traditionally, this is the
 766%   &-char, but these days there are `newstyle' ;-char separators.
 767%
 768%   @see http://perldoc.perl.org/CGI.html
 769%   @tbd This should be configurable
 770
 771search_sep --> "&", !.
 772search_sep --> ";".
 773
 774
 775%!  fragment(-Fragment, ?Tail)//
 776%
 777%   Extract the fragment (after the =#=)
 778
 779fragment([fragment(Fragment)|T], T) -->
 780    "#",
 781    !,
 782    fragment_chars(Codes),
 783    { atom_codes(Fragment, Codes) }.
 784fragment(T, T) -->
 785    [].
 786
 787fragment_chars([H|T]) -->
 788    fragment_char(H),
 789    !,
 790    fragment_chars(T).
 791fragment_chars([]) -->
 792    [].
 793
 794
 795%!  fragment_char(-Char)
 796%
 797%   Find a fragment character.
 798
 799fragment_char(C)   --> pchar(C), !.
 800fragment_char(0'/) --> "/", !.
 801fragment_char(0'?) --> "?", !.
 802fragment_char(0'[) --> "[", !.          % Not according RDF3986!
 803fragment_char(0']) --> "]", !.
 804
 805
 806                 /*******************************
 807                 *      CHARACTER CLASSES       *
 808                 *******************************/
 809
 810%!  pchar(-Code)//
 811%
 812%   unreserved|pct_encoded|sub_delim|":"|"@"
 813%
 814%   Performs UTF-8 decoding of percent encoded strings.
 815
 816pchar(0'\s) --> "+", !.
 817pchar(C) -->
 818    [C],
 819    {   unreserved(C)
 820    ;   sub_delim(C)
 821    ;   C == 0':
 822    ;   C == 0'@
 823    },
 824    !.
 825pchar(C) -->
 826    percent_coded(C).
 827
 828%!  lwalpha(-C)//
 829%
 830%   Demand alpha, return as lowercase
 831
 832lwalpha(H) -->
 833    [C],
 834    { C < 128,
 835      code_type(C, alpha),
 836      code_type(H, to_lower(C))
 837    }.
 838
 839drive_letter(C) -->
 840    [C],
 841    { C < 128,
 842      code_type(C, alpha)
 843    }.
 844
 845
 846                 /*******************************
 847                 *      RESERVED CHARACTERS     *
 848                 *******************************/
 849
 850%!  sub_delim(?Code)
 851%
 852%   Sub-delimiters
 853
 854sub_delim(0'!).
 855sub_delim(0'$).
 856sub_delim(0'&).
 857sub_delim(0'').
 858sub_delim(0'().
 859sub_delim(0')).
 860sub_delim(0'*).
 861sub_delim(0'+).
 862sub_delim(0',).
 863sub_delim(0';).
 864sub_delim(0'=).
 865
 866
 867%!  unreserved(+C)
 868%
 869%   Characters that can be represented without percent escaping
 870%   RFC 3986, section 2.3
 871
 872term_expansion(unreserved(map), Clauses) :-
 873    findall(unreserved(C), unreserved_(C), Clauses).
 874
 875unreserved_(C) :-
 876    between(1, 128, C),
 877    code_type(C, alnum).
 878unreserved_(0'-).
 879unreserved_(0'.).
 880unreserved_(0'_).
 881unreserved_(0'~).                       % 0'
 882
 883unreserved(map).                        % Expanded
 884
 885
 886                 /*******************************
 887                 *              FORMS           *
 888                 *******************************/
 889
 890/* - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 891Encoding/decoding of form-fields  using   the  popular  www-form-encoded
 892encoding used with the HTTP GET.
 893- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - */
 894
 895%!  www_form_encode(+Value, -XWWWFormEncoded) is det.
 896%!  www_form_encode(-Value, +XWWWFormEncoded) is det.
 897%
 898%   En/decode   to/from   application/x-www-form-encoded.   Encoding
 899%   encodes all characters  except  RFC   3986  _unreserved_  (ASCII
 900%   =alnum= (see code_type/2)), and  one   of  "-._~"  using percent
 901%   encoding.  Newline  is  mapped  to  =|%OD%OA|=.  When  decoding,
 902%   newlines appear as a single newline (10) character.
 903%
 904%   Note that a space  is  encoded   as  =|%20|=  instead  of =|+|=.
 905%   Decoding decodes both to a space.
 906%
 907%   @deprecated Use uri_encoded/3 for new code.
 908
 909www_form_encode(Value, Encoded) :-
 910    atomic(Value),
 911    !,
 912    atom_codes(Value, Codes),
 913    phrase(www_encode(Codes, []), EncCodes),
 914    atom_codes(Encoded, EncCodes).
 915www_form_encode(Value, Encoded) :-
 916    atom_codes(Encoded, EncCodes),
 917    phrase(www_decode(Codes), EncCodes),
 918    atom_codes(Value, Codes).
 919
 920%!  www_encode(+Codes, +ExtraUnescaped)//
 921
 922www_encode([0'\r, 0'\n|T], Extra) -->
 923    !,
 924    "%0D%0A",
 925    www_encode(T, Extra).
 926www_encode([0'\n|T], Extra) -->
 927    !,
 928    "%0D%0A",
 929    www_encode(T, Extra).
 930www_encode([H|T], Extra) -->
 931    percent_encode(H, Extra),
 932    www_encode(T, Extra).
 933www_encode([], _) -->
 934    "".
 935
 936percent_encode(C, _Extra) -->
 937    { unreserved(C) },
 938    !,
 939    [C].
 940percent_encode(C, Extra) -->
 941    { memberchk(C, Extra) },
 942    !,
 943    [C].
 944%percent_encode(0' , _) --> !, "+".     % Deprecated: use %20
 945percent_encode(C, _) -->
 946    { C =< 127 },
 947    !,
 948    percent_byte(C).
 949percent_encode(C, _) -->                % Unicode characters
 950    { current_prolog_flag(url_encoding, utf8),
 951      !,
 952      phrase(utf8_codes([C]), Bytes)
 953    },
 954    percent_bytes(Bytes).
 955percent_encode(C, _) -->
 956    { C =< 255 },
 957    !,
 958    percent_byte(C).
 959percent_encode(_C, _) -->
 960    { representation_error(url_character)
 961    }.
 962
 963percent_bytes([]) -->
 964    "".
 965percent_bytes([H|T]) -->
 966    percent_byte(H),
 967    percent_bytes(T).
 968
 969percent_byte(C) -->
 970    [0'%, D1, D2],
 971    {   nonvar(C)
 972    ->  Dv1 is (C>>4 /\ 0xf),
 973        Dv2 is (C /\ 0xf),
 974        code_type(D1, xdigit(Dv1)),
 975        code_type(D2, xdigit(Dv2))
 976    ;   code_type(D1, xdigit(Dv1)),
 977        code_type(D2, xdigit(Dv2)),
 978        C is ((Dv1)<<4) + Dv2
 979    }.
 980
 981percent_coded(C) -->
 982    percent_byte(C0),
 983    !,
 984    (   { C0 == 13                  % %0D%0A --> \n
 985        },
 986        "%0",
 987        ( "A" ; "a" )
 988    ->  { C = 10
 989        }
 990    ;   { C0 >= 0xc0 },             % UTF-8 lead-in
 991        utf8_cont(Cs),
 992        { phrase(utf8_codes([C]), [C0|Cs]) }
 993    ->  []
 994    ;   { C = C0
 995        }
 996    ).
 997
 998%!  www_decode(-Codes)//
 999
1000www_decode([0' |T]) -->
1001    "+",
1002    !,
1003    www_decode(T).
1004www_decode([C|T]) -->
1005    percent_coded(C),
1006    !,
1007    www_decode(T).
1008www_decode([C|T]) -->
1009    [C],
1010    !,
1011    www_decode(T).
1012www_decode([]) -->
1013    [].
1014
1015utf8_cont([H|T]) -->
1016    percent_byte(H),
1017    { between(0x80, 0xbf, H) },
1018    !,
1019    utf8_cont(T).
1020utf8_cont([]) -->
1021    [].
1022
1023
1024%!  set_url_encoding(?Old, +New) is semidet.
1025%
1026%   Query and set the encoding for URLs.  The default is =utf8=.
1027%   The only other defined value is =iso_latin_1=.
1028%
1029%   @tbd    Having a global flag is highly inconvenient, but a
1030%           work-around for old sites using ISO Latin 1 encoding.
1031
1032:- create_prolog_flag(url_encoding, utf8, [type(atom)]).
1033
1034set_url_encoding(Old, New) :-
1035    current_prolog_flag(url_encoding, Old),
1036    (   Old == New
1037    ->  true
1038    ;   must_be(oneof([utf8, iso_latin_1]), New),
1039        set_prolog_flag(url_encoding, New)
1040    ).
1041
1042
1043                 /*******************************
1044                 *       IRI PROCESSING         *
1045                 *******************************/
1046
1047%!  url_iri(+Encoded, -Decoded) is det.
1048%!  url_iri(-Encoded, +Decoded) is det.
1049%
1050%   Convert between a URL, encoding in US-ASCII   and an IRI. An IRI
1051%   is a fully expanded Unicode string.   Unicode  strings are first
1052%   encoded into UTF-8, after which %-encoding takes place.
1053
1054url_iri(Encoded, Decoded) :-
1055    nonvar(Encoded),
1056    !,
1057    (   sub_atom(Encoded, _, _, _, '%')
1058    ->  atom_codes(Encoded, Codes),
1059        unescape_precent(Codes, UTF8),
1060        phrase(utf8_codes(Unicodes), UTF8),
1061        atom_codes(Decoded, Unicodes)
1062    ;   Decoded = Encoded
1063    ).
1064url_iri(URL, IRI) :-
1065    atom_codes(IRI, IRICodes),
1066    atom_codes('/:?#&=', ExtraEscapes),
1067    phrase(www_encode(IRICodes, ExtraEscapes), UrlCodes),
1068    atom_codes(URL, UrlCodes).
1069
1070
1071unescape_precent([], []).
1072unescape_precent([0'%,C1,C2|T0], [H|T]) :-     %'
1073    !,
1074    code_type(C1, xdigit(D1)),
1075    code_type(C2, xdigit(D2)),
1076    H is D1*16 + D2,
1077    unescape_precent(T0, T).
1078unescape_precent([H|T0], [H|T]) :-
1079    unescape_precent(T0, T).
1080
1081
1082                 /*******************************
1083                 *           FORM DATA          *
1084                 *******************************/
1085
1086%!  parse_url_search(?Spec, ?Fields:list(Name=Value)) is det.
1087%
1088%   Construct or analyze an HTTP   search  specification. This deals
1089%   with       form       data       using       the       MIME-type
1090%   =application/x-www-form-urlencoded=  as  used   in    HTTP   GET
1091%   requests.
1092
1093parse_url_search(Spec, Fields) :-
1094    atomic(Spec),
1095    !,
1096    atom_codes(Spec, Codes),
1097    phrase(search(Fields), Codes).
1098parse_url_search(Codes, Fields) :-
1099    is_list(Codes),
1100    !,
1101    phrase(search(Fields), Codes).
1102parse_url_search(Codes, Fields) :-
1103    must_be(list, Fields),
1104    phrase(csearch(Fields, []), Codes).
1105
1106
1107                 /*******************************
1108                 *          FILE URLs           *
1109                 *******************************/
1110
1111%!  file_name_to_url(+File, -URL) is det.
1112%!  file_name_to_url(-File, +URL) is semidet.
1113%
1114%   Translate between a filename and a file:// URL.
1115%
1116%   @tbd    Current implementation does not deal with paths that
1117%           need special encoding.
1118
1119file_name_to_url(File, FileURL) :-
1120    nonvar(File),
1121    !,
1122    absolute_file_name(File, Path),
1123    atom_concat('file://', Path, FileURL),
1124    !.
1125file_name_to_url(File, FileURL) :-
1126    atom_concat('file://', File, FileURL),
1127    !.