View source with raw comments or as raw
   1/*  Part of SWI-Prolog
   2
   3    Author:        Jan Wielemaker
   4    E-mail:        J.Wielemaker@vu.nl
   5    WWW:           http://www.swi-prolog.org
   6    Copyright (c)  2009-2016, VU University Amsterdam
   7    All rights reserved.
   8
   9    Redistribution and use in source and binary forms, with or without
  10    modification, are permitted provided that the following conditions
  11    are met:
  12
  13    1. Redistributions of source code must retain the above copyright
  14       notice, this list of conditions and the following disclaimer.
  15
  16    2. Redistributions in binary form must reproduce the above copyright
  17       notice, this list of conditions and the following disclaimer in
  18       the documentation and/or other materials provided with the
  19       distribution.
  20
  21    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  22    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  23    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  24    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  25    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  26    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  27    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  28    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  29    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  30    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  31    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  32    POSSIBILITY OF SUCH DAMAGE.
  33*/
  34
  35:- module(csv,
  36          [ csv//1,                     % +Rows
  37            csv//2,                     % +Rows, +Options
  38            csv_read_file/2,            % +File, -Data
  39            csv_read_file/3,            % +File, -Data, +Options
  40            csv_read_file_row/3,        % +File, -Row, +Options
  41            csv_write_file/2,           % +File, +Data
  42            csv_write_file/3,           % +File, +Data, +Options
  43            csv_write_stream/3          % +Stream, +Data, +Options
  44          ]).
  45:- use_module(library(record)).
  46:- use_module(library(error)).
  47:- use_module(library(pure_input)).
  48:- use_module(library(debug)).
  49:- use_module(library(option)).
  50
  51/** <module> Process CSV (Comma-Separated Values) data
  52
  53This library parses and generates CSV data.   CSV data is represented in
  54Prolog as a list of rows. Each row   is  a compound term, where all rows
  55have the same name and arity.
  56
  57@tbd    Implement immediate assert of the data to avoid possible stack
  58        overflows.
  59@tbd    Writing creates an intermediate code-list, possibly overflowing
  60        resources.  This waits for pure output!
  61@see RFC 4180
  62*/
  63
  64:- predicate_options(csv//2, 2,
  65                     [ separator(nonneg),       % mustv be code
  66                       strip(boolean),
  67                       ignore_quotes(boolean),
  68                       convert(boolean),
  69                       functor(atom),
  70                       arity(-nonneg),          % actually ?nonneg
  71                       match_arity(boolean)
  72                     ]).
  73:- predicate_options(csv_read_file/3, 3,
  74                     [ pass_to(csv//2, 2),
  75                       pass_to(phrase_from_file/3, 3)
  76                     ]).
  77:- predicate_options(csv_read_file_row/3, 3,
  78                     [ pass_to(csv//2, 2),
  79                       pass_to(open/4, 4)
  80                     ]).
  81:- predicate_options(csv_write_file/3, 3,
  82                     [ pass_to(csv//2, 2),
  83                       pass_to(open/4, 4)
  84                     ]).
  85:- predicate_options(csv_write_stream/3, 3,
  86                     [ pass_to(csv//2, 2)
  87                     ]).
  88
  89
  90:- record
  91    csv_options(separator:integer=0',,
  92                strip:boolean=false,
  93                ignore_quotes:boolean=false,
  94                convert:boolean=true,
  95                functor:atom=row,
  96                arity:integer,
  97                match_arity:boolean=true).
  98
  99
 100%!  csv_read_file(+File, -Rows) is det.
 101%!  csv_read_file(+File, -Rows, +Options) is det.
 102%
 103%   Read a CSV file into a list of   rows. Each row is a Prolog term
 104%   with the same arity. Options  is   handed  to  csv//2. Remaining
 105%   options  are  processed  by    phrase_from_file/3.  The  default
 106%   separator depends on the file name   extension and is =|\t|= for
 107%   =|.tsv|= files and =|,|= otherwise.
 108%
 109%   Suppose we want to create a predicate   table/6  from a CSV file
 110%   that we know contains 6 fields  per   record.  This  can be done
 111%   using the code below. Without the   option  arity(6), this would
 112%   generate a predicate table/N, where N   is  the number of fields
 113%   per record in the data.
 114%
 115%       ==
 116%       ?- csv_read_file(File, Rows, [functor(table), arity(6)]),
 117%          maplist(assert, Rows).
 118%       ==
 119
 120
 121csv_read_file(File, Rows) :-
 122    csv_read_file(File, Rows, []).
 123
 124csv_read_file(File, Rows, Options) :-
 125    default_separator(File, Options, Options1),
 126    make_csv_options(Options1, Record, RestOptions),
 127    phrase_from_file(csv_roptions(Rows, Record), File, RestOptions).
 128
 129
 130default_separator(File, Options0, Options) :-
 131    (   option(separator(_), Options0)
 132    ->  Options = Options0
 133    ;   file_name_extension(_, Ext0, File),
 134        downcase_atom(Ext0, Ext),
 135        ext_separator(Ext, Sep)
 136    ->  Options = [separator(Sep)|Options0]
 137    ;   Options = Options0
 138    ).
 139
 140ext_separator(csv, 0',).
 141ext_separator(tsv, 0'\t).
 142
 143
 144%!  csv(?Rows)// is det.
 145%!  csv(?Rows, +Options)// is det.
 146%
 147%   Prolog DCG to `read/write' CSV data.  Options:
 148%
 149%       * separator(+Code)
 150%       The comma-separator.  Must be a character code.  Default is
 151%       (of course) the comma. Character codes can be specified
 152%       using the 0' notion. E.g., using =|separator(0';)|= parses
 153%       a semicolon separated file.
 154%
 155%       * ignore_quotes(+Boolean)
 156%       If =true= (default false), threat double quotes as a normal
 157%       character.
 158%
 159%       * strip(+Boolean)
 160%       If =true= (default =false=), strip leading and trailing
 161%       blank space.  RFC4180 says that blank space is part of the
 162%       data.
 163%
 164%       * convert(+Boolean)
 165%       If =true= (default), use name/2 on the field data.  This
 166%       translates the field into a number if possible.
 167%
 168%       * functor(+Atom)
 169%       Functor to use for creating row terms.  Default is =row=.
 170%
 171%       * arity(?Arity)
 172%       Number of fields in each row.  This predicate raises
 173%       a domain_error(row_arity(Expected), Found) if a row is
 174%       found with different arity.
 175%
 176%       * match_arity(+Boolean)
 177%       If =false= (default =true=), do not reject CSV files where
 178%       lines provide a varying number of fields (columns).  This
 179%       can be a work-around to use some incorrect CSV files.
 180
 181csv(Rows) -->
 182    csv(Rows, []).
 183
 184csv(Rows, Options) -->
 185    { make_csv_options(Options, Record, _) },
 186    csv_roptions(Rows, Record).
 187
 188csv_roptions(Rows, Record) -->
 189    { ground(Rows) },
 190    !,
 191    emit_csv(Rows, Record).
 192csv_roptions(Rows, Record) -->
 193    csv_data(Rows, Record).
 194
 195csv_data([], _) -->
 196    eof,
 197    !.
 198csv_data([Row|More], Options) -->
 199    row(Row, Options),
 200    !,
 201    { debug(csv, 'Row: ~p', [Row]) },
 202    csv_data(More, Options).
 203
 204eof([], []).
 205
 206row(Row, Options) -->
 207    fields(Fields, Options),
 208    { csv_options_functor(Options, Functor),
 209      Row =.. [Functor|Fields],
 210      functor(Row, _, Arity),
 211      check_arity(Options, Arity)
 212    }.
 213
 214check_arity(Options, Arity) :-
 215    csv_options_arity(Options, Arity),
 216    !.
 217check_arity(Options, _) :-
 218    csv_options_match_arity(Options, false),
 219    !.
 220check_arity(Options, Arity) :-
 221    csv_options_arity(Options, Expected),
 222    domain_error(row_arity(Expected), Arity).
 223
 224fields([F|T], Options) -->
 225    field(F, Options),
 226    (   separator(Options)
 227    ->  fields(T, Options)
 228    ;   end_of_record
 229    ->  { T = [] }
 230    ).
 231
 232field(Value, Options) -->
 233    "\"",
 234    { csv_options_ignore_quotes(Options, false) },
 235    !,
 236    string_codes(Codes),
 237    { make_value(Codes, Value, Options) }.
 238field(Value, Options) -->
 239    { csv_options_strip(Options, true) },
 240    !,
 241    stripped_field(Value, Options).
 242field(Value, Options) -->
 243    { csv_options_separator(Options, Sep) },
 244    field_codes(Codes, Sep),
 245    { make_value(Codes, Value, Options) }.
 246
 247
 248stripped_field(Value, Options) -->
 249    ws,
 250    (   "\"",
 251        { csv_options_strip(Options, false) }
 252    ->  string_codes(Codes),
 253        ws
 254    ;   { csv_options_separator(Options, Sep) },
 255        field_codes(Codes0, Sep),
 256        { strip_trailing_ws(Codes0, Codes) }
 257    ),
 258    { make_value(Codes, Value, Options) }.
 259
 260ws --> " ", !, ws.
 261ws --> "\t", !, ws.
 262ws --> "".
 263
 264strip_trailing_ws(List, Stripped) :-
 265    append(Stripped, WS, List),
 266    all_ws(WS).
 267
 268all_ws([]).
 269all_ws([32|T]) :- all_ws(T).
 270all_ws([9|T]) :- all_ws(T).
 271
 272
 273%!  string_codes(-Codes)
 274%
 275%   Process a double-quotes string where  the   quote  is escaped by
 276%   doubling it. Eats the terminating double-quote.
 277
 278string_codes(List) -->
 279    [H],
 280    (   { H == 0'" }
 281    ->  (   "\""
 282        ->  { List = [H|T] },
 283            string_codes(T)
 284        ;   { List = [] }
 285        )
 286    ;   { List = [H|T] },
 287        string_codes(T)
 288    ).
 289
 290field_codes([], Sep), [Sep] --> [Sep], !.
 291field_codes([], _), "\n" --> "\r\n", !.
 292field_codes([], _), "\n" --> "\n", !.
 293field_codes([H|T], Sep) --> [H], !, field_codes(T, Sep).
 294field_codes([], _) --> [].              % unterminated last record
 295
 296make_value(Codes, Value, Options) :-
 297    csv_options_convert(Options, true),
 298    !,
 299    name(Value, Codes).
 300make_value(Codes, Value, _) :-
 301    atom_codes(Value, Codes).
 302
 303separator(Options) -->
 304    { csv_options_separator(Options, Sep) },
 305    [Sep].
 306
 307end_of_record --> "\n".
 308end_of_record --> "\r\n".
 309end_of_record --> eof.                  % unterminated last record
 310
 311
 312%!  csv_read_file_row(+File, -Row, +Options) is nondet.
 313%
 314%   True when Row is a row in File.  First unifies Row with the first
 315%   row in File. Backtracking  yields  the   second,  ...  row.  This
 316%   interface  is  an  alternative  to  csv_read_file/3  that  avoids
 317%   loading all rows in memory.  Note   that  this interface does not
 318%   guarantee that all rows in File have the same arity.
 319%
 320%   In addition to the  options   of  csv_read_file/3, this predicate
 321%   processes the option:
 322%
 323%     * line(-Line)
 324%     Line is unified with the 1-based line-number from which Row is
 325%     read.  Note that Line is not the physical line, but rather the
 326%     _logical_ record number.
 327%
 328%   @tbd    Input is read line by line.  If a record separator is
 329%           embedded in a quoted field, parsing the record fails and
 330%           another line is added to the input.  This does not nicely
 331%           deal with other reasons why parsing the row may fail.
 332
 333csv_read_file_row(File, Row, Options) :-
 334    default_separator(File, Options, Options1),
 335    make_csv_options(Options1, RecordOptions, Options2),
 336    select_option(line(Line), Options2, RestOptions, _),
 337    setup_call_cleanup(
 338        open(File, read, Stream, RestOptions),
 339        csv_read_stream_row(Stream, Row, Line, RecordOptions),
 340        close(Stream)).
 341
 342csv_read_stream_row(Stream, _Row, _Line, _Options) :-
 343    at_end_of_stream(Stream),
 344    !,
 345    fail.
 346csv_read_stream_row(Stream, Row, Line, Options) :-
 347    between(1, infinite, Line),
 348    read_row(Stream, Row, Options),
 349    (   at_end_of_stream(Stream)            % make reading the last row
 350    ->  !                                   % deterministic.
 351    ;   true
 352    ).
 353
 354read_row(Stream, Row, Options) :-
 355    read_lines_to_codes(Stream, Codes),
 356    phrase(row(Row0, Options), Codes),
 357    !,
 358    Row = Row0.
 359
 360read_lines_to_codes(Stream, Codes) :-
 361    read_line_to_codes(Stream, Codes, Tail),
 362    (   Tail == []
 363    ->  true
 364    ;   Tail = []
 365    ;   read_lines_to_codes(Stream, Tail)
 366    ).
 367
 368
 369                /*******************************
 370                *             OUTPUT           *
 371                *******************************/
 372
 373%!  csv_write_file(+File, +Data) is det.
 374%!  csv_write_file(+File, +Data, +Options) is det.
 375%
 376%   Write a list of Prolog terms to a CSV file.  Options are given
 377%   to csv//2.  Remaining options are given to open/4.  The  default
 378%   separator depends on the file name   extension and is =|\t|= for
 379%   =|.tsv|= files and =|,|= otherwise.
 380
 381csv_write_file(File, Data) :-
 382    csv_write_file(File, Data, []).
 383
 384csv_write_file(File, Data, Options) :-
 385    must_be(list, Data),
 386    default_separator(File, Options, Options1),
 387    make_csv_options(Options1, Record, RestOptions),
 388    phrase(emit_csv(Data, Record), String),
 389    setup_call_cleanup(
 390        open(File, write, Out, RestOptions),
 391        format(Out, '~s', [String]),
 392        close(Out)).
 393
 394
 395emit_csv([], _) --> [].
 396emit_csv([H|T], Options) -->
 397    emit_row(H, Options), "\r\n",   % RFC 4180 demands \r\n
 398    emit_csv(T, Options).
 399
 400emit_row(Row, Options) -->
 401    { Row =.. [_|Fields] },
 402    emit_fields(Fields, Options).
 403
 404emit_fields([H|T], Options) -->
 405    emit_field(H, Options),
 406    (   { T == [] }
 407        ->  []
 408        ;   { csv_options_separator(Options, Sep) },
 409        [Sep],
 410        emit_fields(T, Options)
 411    ).
 412
 413emit_field(H, Options) -->
 414    { (   atom(H)
 415      ->  atom_codes(H, Codes)
 416      ;   string(H)
 417      ->  string_codes(H, Codes)
 418      )
 419    },
 420    !,
 421    (   { needs_quotes(H, Options) }
 422    ->  "\"", emit_string(Codes), "\""
 423    ;   emit_codes(Codes)
 424    ).
 425emit_field([], _) -->
 426    !,
 427    { atom_codes('[]', Codes) },
 428    emit_codes(Codes).
 429emit_field(H, _) -->
 430    { number_codes(H,Codes) },
 431    emit_codes(Codes).
 432
 433needs_quotes(Atom, _) :-
 434    sub_atom(Atom, _, _, _, '"'),
 435    !.
 436needs_quotes(Atom, _) :-
 437    sub_atom(Atom, _, _, _, '\n'),
 438    !.
 439needs_quotes(Atom, _) :-
 440    sub_atom(Atom, _, _, _, '\r'),
 441    !.
 442needs_quotes(Atom, Options) :-
 443    csv_options_separator(Options, Sep),
 444    char_code(Char, Sep),
 445    sub_atom(Atom, _, _, _, Char),
 446    !.
 447
 448emit_string([]) --> "".
 449emit_string([0'"|T]) --> !, "\"\"", emit_string(T).
 450emit_string([H|T]) --> [H], emit_string(T).
 451
 452emit_codes([]) --> "".
 453emit_codes([0'"|T]) --> !, "\"\"", emit_codes(T).
 454emit_codes([H|T]) --> [H], emit_codes(T).
 455
 456
 457%%     csv_write_stream(+Stream, +Data, +Options) is det.
 458%
 459%      Write  the  rows  in  Data  to    Stream.   This  is  similar  to
 460%      csv_write_file/3,  but  can  deal  with  data  that  is  produced
 461%      incrementally. The example  below  saves   all  answers  from the
 462%      predicate data/3 to File.
 463%
 464%        ==
 465%        save_data(File) :-
 466%           setup_call_cleanup(
 467%               open(File, write, Out),
 468%               forall(data(C1,C2,C3),
 469%                      csv_write_stream(Out, [row(C1,C2,C3)], [])),
 470%               close(Out)),
 471%        ==
 472
 473csv_write_stream(Stream, Data, Options) :-
 474    must_be(list, Data),
 475    make_csv_options(Options, Record, _),
 476    phrase(emit_csv(Data, Record), String),
 477    format(Stream, '~s', [String]).