View source with raw comments or as raw
   1/*  Part of SWI-Prolog
   2
   3    Author:        Jan Wielemaker
   4    E-mail:        J.Wielemaker@vu.nl
   5    WWW:           http://www.swi-prolog.org
   6    Copyright (c)  2006-2015, University of Amsterdam
   7                              VU University Amsterdam
   8    All rights reserved.
   9
  10    Redistribution and use in source and binary forms, with or without
  11    modification, are permitted provided that the following conditions
  12    are met:
  13
  14    1. Redistributions of source code must retain the above copyright
  15       notice, this list of conditions and the following disclaimer.
  16
  17    2. Redistributions in binary form must reproduce the above copyright
  18       notice, this list of conditions and the following disclaimer in
  19       the documentation and/or other materials provided with the
  20       distribution.
  21
  22    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  23    "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  24    LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  25    FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  26    COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  27    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  28    BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  29    LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  30    CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  31    LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
  32    ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  33    POSSIBILITY OF SUCH DAMAGE.
  34*/
  35
  36:- module(rdf_litindex,
  37          [ rdf_set_literal_index_option/1,     % +Options
  38            rdf_tokenize_literal/2,             % +Literal, -Tokens
  39            rdf_find_literal/2,                 % +Spec, -Literal
  40            rdf_find_literals/2,                % +Spec, -ListOfLiterals
  41            rdf_token_expansions/2,             % +Spec, -Expansions
  42            rdf_stopgap_token/1,                % -Token
  43
  44            rdf_literal_index/2,                % +Type, -Index
  45            rdf_delete_literal_index/1          % +Type
  46          ]).
  47:- use_module(rdf_db).
  48:- use_module(library(debug)).
  49:- use_module(library(lists)).
  50:- use_module(library(error)).
  51:- use_module(library(apply)).
  52:- if(exists_source(library(snowball))).
  53:- use_module(library(snowball)).
  54:- else.
  55:- use_module(library(porter_stem)).
  56:- endif.
  57:- use_module(library(double_metaphone)).
  58
  59/** <module> Search literals
  60
  61This module finds literals of the RDF  database based on words, stemming
  62and _sounds like_ (metaphone).  The normal user-level predicate is
  63
  64  - rdf_find_literals/2
  65*/
  66
  67:- dynamic
  68    literal_map/2,                  % Type, -Map
  69    map_building/2,                 % Type, -Queue
  70    new_token/2,                    % Hook
  71    setting/1,
  72    stopgap/1.
  73:- volatile
  74    literal_map/2.
  75:- multifile
  76    tokenization/2,                 % +Literal, -Tokens
  77    exclude_from_index/2.           % +Which, +Token
  78
  79
  80setting(verbose(false)).                % print progress messages
  81setting(index_threads(1)).              % # threads for creating the index
  82setting(index(thread(1))).              % Use a thread for incremental updates
  83setting(stopgap_threshold(50000)).      % consider token a stopgap over N
  84
  85%!  rdf_set_literal_index_option(+Options:list)
  86%
  87%   Set options for the literal package.  Currently defined options
  88%
  89%           * verbose(Bool)
  90%           If =true=, print progress messages while building the
  91%           index tables.
  92%
  93%           * index_threads(+Count)
  94%           Number of threads to use for initial indexing of
  95%           literals
  96%
  97%           * index(+How)
  98%           How to deal with indexing new literals.  How is one of
  99%           =self= (execute in the same thread), thread(N) (execute
 100%           in N concurrent threads) or =default= (depends on number
 101%           of cores).
 102%
 103%           * stopgap_threshold(+Count)
 104%           Add a token to the dynamic stopgap set if it appears in
 105%           more than Count literals.  The default is 50,000.
 106
 107rdf_set_literal_index_option([]) :- !.
 108rdf_set_literal_index_option([H|T]) :-
 109    !,
 110    set_option(H),
 111    rdf_set_literal_index_option(T).
 112rdf_set_literal_index_option(Option) :-
 113    set_option(Option).
 114
 115set_option(Term) :-
 116    check_option(Term),
 117    functor(Term, Name, Arity),
 118    functor(General, Name, Arity),
 119    retractall(setting(General)),
 120    assert(setting(Term)).
 121
 122check_option(X) :-
 123    var(X),
 124    !,
 125    instantiation_error(X).
 126check_option(verbose(X)) :-
 127    !,
 128    must_be(boolean, X).
 129check_option(index_threads(Count)) :-
 130    !,
 131    must_be(nonneg, Count).
 132check_option(stopgap_threshold(Count)) :-
 133    !,
 134    must_be(nonneg, Count).
 135check_option(index(How)) :-
 136    !,
 137    must_be(oneof([default,thread(_),self]), How).
 138check_option(Option) :-
 139    domain_error(literal_option, Option).
 140
 141
 142                 /*******************************
 143                 *            QUERY             *
 144                 *******************************/
 145
 146%!  rdf_find_literal(+Spec, -Literal) is nondet.
 147%!  rdf_find_literals(+Spec, -Literals) is det.
 148%
 149%   Find literals in the RDF database matching Spec.  Spec is defined
 150%   as:
 151%
 152%   ==
 153%   Spec ::= and(Spec,Spec)
 154%   Spec ::= or(Spec,Spec)
 155%   Spec ::= not(Spec)
 156%   Spec ::= sounds(Like)
 157%   Spec ::= stem(Like)             % same as stem(Like, en)
 158%   Spec ::= stem(Like, Lang)
 159%   Spec ::= prefix(Prefix)
 160%   Spec ::= between(Low, High)     % Numerical between
 161%   Spec ::= ge(High)               % Numerical greater-equal
 162%   Spec ::= le(Low)                % Numerical less-equal
 163%   Spec ::= Token
 164%   ==
 165%
 166%   sounds(Like) and stem(Like) both map to  a disjunction. First we
 167%   compile the spec to normal form:   a disjunction of conjunctions
 168%   on elementary tokens. Then we execute   all the conjunctions and
 169%   generate the union using ordered-set algorithms.
 170%
 171%   Stopgaps are ignored. If the final result is only a stopgap, the
 172%   predicate fails.
 173%
 174%   @tbd Exploit ordering of numbers and allow for > N, < N, etc.
 175
 176rdf_find_literal(Spec, Literal) :-
 177    rdf_find_literals(Spec, Literals),
 178    member(Literal, Literals).
 179
 180rdf_find_literals(Spec, Literals) :-
 181    compile_spec(Spec, DNF),
 182    DNF \== @(stopgap),
 183    token_index(Map),
 184    lookup(DNF, Map, _, SuperSet),
 185    flatten(SuperSet, Set0),
 186    sort(Set0, Literals).
 187
 188%!  rdf_token_expansions(+Spec, -Extensions)
 189%
 190%   Determine which extensions of  a   token  contribute  to finding
 191%   literals.
 192
 193rdf_token_expansions(prefix(Prefix), [prefix(Prefix, Tokens)]) :-
 194    token_index(Map),
 195    rdf_keys_in_literal_map(Map, prefix(Prefix), Tokens).
 196rdf_token_expansions(sounds(Like), [sounds(Like, Tokens)]) :-
 197    metaphone_index(Map),
 198    rdf_find_literal_map(Map, [Like], Tokens).
 199rdf_token_expansions(stem(Like), [stem(Like, Tokens)]) :-
 200    stem_index(Map),
 201    rdf_find_literal_map(Map, [Like], Tokens).
 202rdf_token_expansions(Spec, Expansions) :-
 203    compile_spec(Spec, DNF),
 204    token_index(Map),
 205    lookup(DNF, Map, SCS, _),
 206    flatten(SCS, CS),
 207    sort(CS, Expansions0),
 208    join_expansions(Expansions0, Expansions).
 209
 210join_expansions([], []).
 211join_expansions([H0|T0], [H|T]) :-
 212    untag(H0, Tag, V0),
 213    Tag =.. L0,
 214    append(L0, [[V0|Values]], L1),
 215    H =.. L1,
 216    join_expansions_by_tag(T0, Tag, T1, Values),
 217    join_expansions(T1, T).
 218
 219join_expansions_by_tag([H|T0], Tag, T, [V0|VT]) :-
 220    untag(H, Tag, V0),
 221    !,
 222    join_expansions_by_tag(T0, Tag, T, VT).
 223join_expansions_by_tag(L, _, L, []).
 224
 225lookup(@(false), _, [], []) :- !.
 226lookup(or(H0,T0), Map, [CH|CT], [H|T]) :-
 227    !,
 228    lookup(H0, Map, CH, H),
 229    lookup(T0, Map, CT, T).
 230lookup(H0, Map, [C], [H]) :-
 231    lookup1(H0, Map, C, H).
 232
 233lookup1(Conj, Map, Cond, Literals) :-
 234    phrase(conj_to_list(Conj), List),
 235    !,
 236    rdf_find_literal_map(Map, List, Literals),
 237    (   Literals \== []
 238    ->  phrase(conj_to_cond(Conj), Cond)
 239    ;   Cond = []
 240    ).
 241lookup1(_, _, _, []).
 242
 243conj_to_list(and(A,B)) -->
 244    !,
 245    conj_to_list(A),
 246    conj_to_list(B).
 247conj_to_list(@(false)) -->
 248    !,
 249    {fail}.
 250conj_to_list(Tagged) -->
 251    { untag(Tagged, L) },
 252    !,
 253    [L].
 254conj_to_list(L) -->
 255    [L].
 256
 257
 258conj_to_cond(and(A,B)) -->
 259    !,
 260    conj_to_cond(A),
 261    conj_to_cond(B).
 262conj_to_cond(Tagged) -->
 263    { untag(Tagged, _) },
 264    !,
 265    [ Tagged ].
 266conj_to_cond(_) -->
 267    [].
 268
 269
 270%!  compile_spec(+Spec, -Compiled)
 271%
 272%   Compile a specification as above into disjunctive normal form
 273
 274compile_spec(Spec, DNF) :-
 275    expand_fuzzy(Spec, Spec2),
 276    nnf(Spec2, NNF),
 277    dnf(NNF, DNF).
 278
 279
 280expand_fuzzy(Var, _) :-
 281    var(Var),
 282    !,
 283    throw(error(instantiation_error, _)).
 284expand_fuzzy(sounds(Like), Or) :-
 285    !,
 286    (   atom(Like)
 287    ->  metaphone_index(Map),
 288        double_metaphone(Like, Key),
 289        rdf_find_literal_map(Map, [Key], Tokens),
 290        list_to_or(Tokens, sounds(Like), Or)
 291    ;   expand_fuzzy(Like, Or)
 292    ).
 293expand_fuzzy(stem(Like), Or) :-
 294    !,
 295    expand_fuzzy(stem(Like, en), Or).
 296expand_fuzzy(stem(Like, Lang), Or) :-
 297    !,
 298    (   atom(Like)
 299    ->  stem_index(Map),
 300        stem(Like, Lang, Key),
 301        rdf_find_literal_map(Map, [Key], Tokens),
 302        list_to_or(Tokens, stem(Like), Or)
 303    ;   expand_fuzzy(Like, Or)
 304    ).
 305expand_fuzzy(prefix(Prefix), Or) :-
 306    !,
 307    (   atom(Prefix)
 308    ->  token_index(Map),
 309        rdf_keys_in_literal_map(Map, prefix(Prefix), Tokens),
 310        list_to_or(Tokens, prefix(Prefix), Or)
 311    ;   expand_fuzzy(Prefix, Or)
 312    ).
 313expand_fuzzy(case(String), Or) :-
 314    !,
 315    (   atom(String)
 316    ->  token_index(Map),
 317        rdf_keys_in_literal_map(Map, case(String), Tokens),
 318        list_to_or(Tokens, case(String), Or)
 319    ;   expand_fuzzy(String, Or)
 320    ).
 321expand_fuzzy(or(A0, B0), E) :-
 322    !,
 323    expand_fuzzy(A0, A),
 324    expand_fuzzy(B0, B),
 325    simplify(or(A,B), E).
 326expand_fuzzy(and(A0, B0), E) :-
 327    !,
 328    expand_fuzzy(A0, A),
 329    expand_fuzzy(B0, B),
 330    simplify(and(A,B), E).
 331expand_fuzzy(not(A0), not(A)) :-
 332    !,
 333    expand_fuzzy(A0, A).
 334expand_fuzzy(between(Low, High), Or) :-
 335    !,
 336    token_index(Map),
 337    rdf_keys_in_literal_map(Map, between(Low, High), Tokens),
 338    list_to_or(Tokens, between(Low, High), Or).
 339expand_fuzzy(le(High), Or) :-
 340    !,
 341    token_index(Map),
 342    rdf_keys_in_literal_map(Map, le(High), Tokens),
 343    list_to_or(Tokens, le(High), Or).
 344expand_fuzzy(ge(Low), Or) :-
 345    !,
 346    token_index(Map),
 347    rdf_keys_in_literal_map(Map, ge(Low), Tokens),
 348    list_to_or(Tokens, ge(Low), Or).
 349expand_fuzzy(Token, Result) :-
 350    atomic(Token),
 351    !,
 352    (   rdf_stopgap_token(Token)
 353    ->  Result = @(stopgap)
 354    ;   Result = Token
 355    ).
 356expand_fuzzy(Token, _) :-
 357    throw(error(type_error(Token, boolean_expression), _)).
 358
 359simplify(Expr0, Expr) :-
 360    simple(Expr0, Expr),
 361    !.
 362simplify(Expr, Expr).
 363
 364simple(and(@(false), _), @(false)).
 365simple(and(_, @(false)), @(false)).
 366simple(and(@(stopgap), Token), Token).
 367simple(and(Token, @(stopgap)), Token).
 368simple(or(@(false), X), X).
 369simple(or(X, @(false)), X).
 370simple(or(@(stopgap), Token), Token).
 371simple(or(Token, @(stopgap)), Token).
 372
 373
 374list_to_or([], _, @(false)) :- !.
 375list_to_or([X], How, One) :-
 376    !,
 377    tag(How, X, One).
 378list_to_or([H0|T0], How, or(H, T)) :-
 379    tag(How, H0, H),
 380    list_to_or(T0, How, T).
 381
 382tag(sounds(X),    Y, sounds(X,Y)).
 383tag(stem(X),      Y, stem(X,Y)).
 384tag(prefix(X),    Y, prefix(X,Y)).
 385tag(case(X),      Y, case(X,Y)).
 386tag(between(L,H), Y, between(L,H,Y)).
 387tag(ge(L),        Y, ge(L,Y)).
 388tag(le(H),        Y, le(H,Y)).
 389
 390untag(sounds(_,Y),    Y).
 391untag(stem(_,Y),      Y).
 392untag(prefix(_,Y),    Y).
 393untag(case(_,Y),      Y).
 394untag(between(_,_,Y), Y).
 395untag(le(_,Y),        Y).
 396untag(ge(_,Y),        Y).
 397
 398untag(sounds(X,Y),    sounds(X),    Y).
 399untag(stem(X,Y),      stem(X),      Y).
 400untag(prefix(X,Y),    prefix(X),    Y).
 401untag(case(X,Y),      case(X),      Y).
 402untag(between(L,H,Y), between(L,H), Y).
 403untag(ge(L,Y),        ge(L),        Y).
 404untag(le(H,Y),        le(H),        Y).
 405
 406
 407%!  nnf(+Formula, -NNF)
 408%
 409%   Rewrite to Negative Normal Form, meaning negations only appear
 410%   around literals.
 411
 412nnf(not(not(A0)), A) :-
 413    !,
 414    nnf(A0, A).
 415nnf(not(and(A0,B0)), or(A,B)) :-
 416    !,
 417    nnf(not(A0), A),
 418    nnf(not(B0), B).
 419nnf(not(or(A0,B0)), and(A,B)) :-
 420    !,
 421    nnf(not(A0), A),
 422    nnf(not(B0), B).
 423nnf(A, A).
 424
 425
 426%!  dnf(+NNF, -DNF)
 427%
 428%   Convert a formula in NNF to Disjunctive Normal Form (DNF)
 429
 430dnf(or(A0,B0), or(A, B)) :-
 431    !,
 432    dnf(A0, A),
 433    dnf(B0, B).
 434dnf(and(A0,B0), DNF):-
 435    !,
 436    dnf(A0, A1),
 437    dnf(B0, B1),
 438    dnf1(and(A1,B1), DNF).
 439dnf(DNF, DNF).
 440
 441dnf1(and(A0, or(B,C)), or(P,Q)) :-
 442    !,
 443    dnf1(and(A0,B), P),
 444    dnf1(and(A0,C), Q).
 445dnf1(and(or(B,C), A0), or(P,Q)) :-
 446    !,
 447    dnf1(and(A0,B), P),
 448    dnf1(and(A0,C), Q).
 449dnf1(DNF, DNF).
 450
 451
 452                 /*******************************
 453                 *          TOKEN INDEX         *
 454                 *******************************/
 455
 456%!  token_index(-Map)
 457%
 458%   Get the index of tokens. If  not   present,  create one from the
 459%   current database. Once created, the map is kept up-to-date using
 460%   a monitor hook.
 461
 462token_index(Map) :-
 463    literal_map(token, Map),
 464    !,
 465    wait_for_map(token).
 466token_index(Map) :-
 467    rdf_new_literal_map(Map),
 468    assert(literal_map(token, Map)),
 469    register_token_updater,
 470    message_queue_create(Queue),
 471    assert(map_building(token, Queue)),
 472    thread_create(make_literal_index(Queue), _,
 473                  [ alias('__rdf_tokenizer'),
 474                    detached(true)
 475                  ]),
 476    wait_for_map(token).
 477
 478register_token_updater :-
 479    Monitor = [ reset,
 480                new_literal,
 481                old_literal
 482              ],
 483    (   setting(index(default))
 484    ->  create_update_literal_thread(1),
 485        rdf_monitor(thread_monitor_literal, Monitor)
 486    ;   setting(index(thread(N)))
 487    ->  create_update_literal_thread(N),
 488        rdf_monitor(thread_monitor_literal, Monitor)
 489    ;   rdf_monitor(monitor_literal, Monitor)
 490    ).
 491
 492make_literal_index(Queue) :-
 493    call_cleanup(
 494        make_literal_index,
 495        ( message_queue_destroy(Queue),
 496          retractall(map_building(token, _)))).
 497
 498%!  make_literal_index
 499%
 500%   Create the initial literal index.
 501
 502make_literal_index :-
 503    setting(index_threads(N)),
 504    !,
 505    threaded_literal_index(N),
 506    verbose('~N', []).
 507make_literal_index :-
 508    current_prolog_flag(cpu_count, X),
 509    threaded_literal_index(X),
 510    verbose('~N', []).
 511
 512threaded_literal_index(N) :-
 513    N > 1,
 514    !,
 515    message_queue_create(Q, [max_size(1000)]),
 516    create_index_threads(N, Q, Ids),
 517    forall(rdf_current_literal(Literal),
 518           thread_send_message(Q, Literal)),
 519    forall(between(1, N, _),
 520           thread_send_message(Q, done(true))),
 521    maplist(thread_join, Ids, _).
 522threaded_literal_index(_) :-
 523    forall(rdf_current_literal(Literal),
 524           register_literal(Literal)).
 525
 526create_index_threads(N, Q, [Id|T]) :-
 527    N > 0,
 528    !,
 529    thread_create(index_worker(Q), Id, []),
 530    N2 is N - 1,
 531    create_index_threads(N2, Q, T).
 532create_index_threads(_, _, []) :- !.
 533
 534index_worker(Queue) :-
 535    repeat,
 536        thread_get_message(Queue, Msg),
 537        work(Msg).
 538
 539work(done(true)) :- !.
 540work(Literal) :-
 541    register_literal(Literal),
 542    fail.
 543
 544
 545%!  clean_token_index
 546%
 547%   Clean after a reset.
 548
 549clean_token_index :-
 550    forall(literal_map(_, Map),
 551           rdf_reset_literal_map(Map)),
 552    retractall(stopgap(_)).
 553
 554%!  rdf_delete_literal_index(+Type)
 555%
 556%   Fully delete a literal index
 557
 558rdf_delete_literal_index(Type) :-
 559    must_be(atom, Type),
 560    (   retract(literal_map(Type, Map))
 561    ->  rdf_reset_literal_map(Map)          % destroy is unsafe
 562    ).
 563
 564                 /*******************************
 565                 *        THREADED UPDATE       *
 566                 *******************************/
 567
 568%!  create_update_literal_thread(+Threads)
 569%
 570%   Setup literal monitoring using threads.  While loading databases
 571%   through rdf_attach_db/2 from  rdf_persistency.pl,   most  of the
 572%   time is spent updating the literal token database. While loading
 573%   the RDF triples, most of the time   is spend in updating the AVL
 574%   tree holding the literals. Updating  the   token  index hangs on
 575%   updating the AVL trees holding the   tokens.  Both tasks however
 576%   can run concurrently.
 577
 578create_update_literal_thread(Threads) :-
 579    message_queue_create(_,
 580                         [ alias(rdf_literal_monitor_queue),
 581                           max_size(50000)
 582                         ]),
 583    forall(between(1, Threads, _),
 584           create_index_worker(initial)).
 585
 586:- dynamic
 587    index_worker_id/1,
 588    extra_worker_count/1.
 589
 590create_index_worker(Status) :-
 591    (   retract(index_worker_id(Id0))
 592    ->  true
 593    ;   Id0 = 1
 594    ),
 595    succ(Id0, Id1),
 596    assertz(index_worker_id(Id1)),
 597    atom_concat(rdf_literal_monitor_, Id0, Alias),
 598    inc_extra_worker_count(Status),
 599    thread_create(monitor_literals(Status), _,
 600                  [ alias(Alias)
 601                  ]).
 602
 603monitor_literals(initial) :-
 604    set_prolog_flag(agc_margin, 0), % we don't create garbage
 605    repeat,
 606        thread_get_message(rdf_literal_monitor_queue, Literal),
 607        register_literal(Literal),
 608    fail.
 609monitor_literals(extra) :-
 610    set_prolog_flag(agc_margin, 0),
 611    repeat,
 612        (   thread_get_message(rdf_literal_monitor_queue, Literal,
 613                               [ timeout(1)
 614                               ])
 615        ->  register_literal(Literal),
 616            fail
 617        ;   !
 618        ),
 619    with_mutex(create_index_worker, dec_extra_worker_count),
 620    thread_self(Me),
 621    thread_detach(Me).
 622
 623thread_monitor_literal(new_literal(Literal)) :-
 624    !,
 625    thread_send_message(rdf_literal_monitor_queue, Literal).
 626thread_monitor_literal(Action) :-
 627    !,
 628    monitor_literal(Action).
 629
 630%!  check_index_workers(+Queue, +Keys)
 631%
 632%   Increase the number of workers indexing   literals sent to Queue
 633%   if the queue gets overful.
 634
 635check_index_workers(Alias, Keys) :-
 636    max_extra_workers(Max),
 637    Max > 0,
 638    message_queue_property(Queue, alias(Alias)),
 639    message_queue_property(Queue, size(Size)),
 640    Size > 10000,
 641    \+ ( extra_worker_count(Extra),
 642         Extra >= Max
 643       ),
 644    !,
 645    debug(rdf_litindex,
 646          'Creating extra literal indexer (Queue=~D, Keys=~D)',
 647          [Size, Keys]),
 648    with_mutex(create_index_worker, create_index_worker(extra)).
 649check_index_workers(_, _).
 650
 651inc_extra_worker_count(extra) :-
 652    !,
 653    (   retract(extra_worker_count(C0))
 654    ->  C is C0+1
 655    ;   C = 1
 656    ),
 657    asserta(extra_worker_count(C)).
 658inc_extra_worker_count(_).
 659
 660dec_extra_worker_count :-
 661    retract(extra_worker_count(C0)),
 662    !,
 663    C is C0-1,
 664    asserta(extra_worker_count(C)).
 665dec_extra_worker_count.
 666
 667max_extra_workers(Max) :-
 668    current_prolog_flag(cpu_count, Count),
 669    Max is Count//2.
 670
 671
 672                 /*******************************
 673                 *       MONITORED UPDATE       *
 674                 *******************************/
 675
 676monitor_literal(new_literal(Literal)) :-
 677    register_literal(Literal).
 678monitor_literal(old_literal(Literal)) :-
 679    unregister_literal(Literal).
 680monitor_literal(transaction(begin, reset)) :-
 681    rdf_monitor(monitor_literal, [-old_literal]),
 682    clean_token_index.
 683monitor_literal(transaction(end, reset)) :-
 684    rdf_monitor(monitor_literal, [+old_literal]).
 685
 686%!  register_literal(+Literal)
 687%
 688%   Associate the tokens of a literal with the literal itself.
 689
 690register_literal(Literal) :-
 691    (   rdf_tokenize_literal(Literal, Tokens0)
 692    ->  sort(Tokens0, Tokens),
 693        text_of(Literal, Lang, Text),
 694        literal_map(token, Map),
 695        add_tokens(Tokens, Lang, Text, Map)
 696    ;   true
 697    ).
 698
 699add_tokens([], _, _, _).
 700add_tokens([H|T], Lang, Literal, Map) :-
 701    rdf_insert_literal_map(Map, H, Literal, Keys),
 702    (   var(Keys)
 703    ->  (   rdf_keys_in_literal_map(Map, key(H), Count),
 704            setting(stopgap_threshold(Threshold)),
 705            Count > Threshold
 706        ->  assert(stopgap(H)),
 707            rdf_delete_literal_map(Map, H)
 708        ;   true
 709        )
 710    ;   forall(new_token(H, Lang), true),
 711        (   Keys mod 1000 =:= 0
 712        ->  progress(Map, 'Tokens'),
 713            (   Keys mod 10000 =:= 0
 714            ->  check_index_workers(rdf_literal_monitor_queue, Keys)
 715            ;   true
 716            )
 717        ;   true
 718        )
 719    ),
 720    add_tokens(T, Lang, Literal, Map).
 721
 722
 723%!  unregister_literal(+Literal)
 724%
 725%   Literal is removed from the database.   As we abstract from lang
 726%   and type qualifiers we first have to  check this is the last one
 727%   that is destroyed.
 728
 729unregister_literal(Literal) :-
 730    text_of(Literal, _Lang, Text),
 731    (   rdf(_,_,literal(Text))
 732    ->  true                        % still something left
 733    ;   rdf_tokenize_literal(Literal, Tokens0),
 734        sort(Tokens0, Tokens),
 735        literal_map(token, Map),
 736        del_tokens(Tokens, Text, Map)
 737    ).
 738
 739del_tokens([], _, _).
 740del_tokens([H|T], Literal, Map) :-
 741    rdf_delete_literal_map(Map, H, Literal),
 742    del_tokens(T, Literal, Map).
 743
 744
 745%!  rdf_tokenize_literal(+Literal, -Tokens) is semidet.
 746%
 747%   Tokenize a literal. We make  this   hookable  as tokenization is
 748%   generally domain dependent.
 749
 750rdf_tokenize_literal(Literal, Tokens) :-
 751    tokenization(Literal, Tokens),
 752    !.               % Hook
 753rdf_tokenize_literal(Literal, Tokens) :-
 754    text_of(Literal, _Lang, Text),
 755    atom(Text),
 756    tokenize_atom(Text, Tokens0),
 757    select_tokens(Tokens0, Tokens).
 758
 759select_tokens([], []).
 760select_tokens([H|T0], T) :-
 761    (   exclude_from_index(token, H)
 762    ->  select_tokens(T0, T)
 763    ;   number(H)
 764    ->  (   integer(H),
 765            between(-1073741824, 1073741823, H)
 766        ->  T = [H|T1],
 767            select_tokens(T0, T1)
 768        ;   select_tokens(T0, T)
 769        )
 770    ;   atom_length(H, 1)
 771    ->  select_tokens(T0, T)
 772    ;   default_stopgap(H)
 773    ->  select_tokens(T0, T)
 774    ;   stopgap(H)
 775    ->  select_tokens(T0, T)
 776    ;   T = [H|T1],
 777        select_tokens(T0, T1)
 778    ).
 779
 780%!  rdf_stopgap_token(-Token) is nondet.
 781%
 782%   True when Token is a stopgap  token. Currently, this implies one
 783%   of:
 784%
 785%     - exclude_from_index(token, Token) is true
 786%     - default_stopgap(Token) is true
 787%     - Token is an atom of length 1
 788%     - Token was added to the dynamic stopgap token set because
 789%       it appeared in more than _stopgap_threshold_ literals.
 790
 791rdf_stopgap_token(Token) :-
 792    (   var(Token)
 793    ->  rdf_stopgap_token2(Token)
 794    ;   rdf_stopgap_token2(Token), !
 795    ).
 796
 797rdf_stopgap_token2(Token) :-
 798    exclude_from_index(token, Token).
 799rdf_stopgap_token2(Token) :-
 800    default_stopgap(Token).
 801rdf_stopgap_token2(Token) :-
 802    atom(Token),
 803    atom_length(Token, 1).
 804rdf_stopgap_token2(Token) :-
 805    stopgap(Token).
 806
 807%!  default_stopgap(?Token)
 808%
 809%   Tokens we do not wish to index,   as  they creat huge amounts of
 810%   data with little or no value.  Is   there  a more general way to
 811%   describe this? Experience shows that simply  word count is not a
 812%   good criterium as it often rules out popular domain terms.
 813
 814default_stopgap(and).
 815default_stopgap(an).
 816default_stopgap(or).
 817default_stopgap(of).
 818default_stopgap(on).
 819default_stopgap(in).
 820default_stopgap(this).
 821default_stopgap(the).
 822
 823
 824%!  text_of(+LiteralArg, -Lang, -Text) is semidet.
 825%
 826%   Get the textual  or  (integer)   numerical  information  from  a
 827%   literal value. Lang  is  the  language   to  use  for  stemming.
 828%   Currently we use English for untyped  plain literals or literals
 829%   typed xsd:string. Formally, these should not be tokenized, but a
 830%   lot of data out there does not tag strings with their language.
 831
 832text_of(type(xsd:string, Text), en, Text) :- !.
 833text_of(type(_, Text), -, Text) :- !.
 834text_of(lang(Lang, Text), Lang, Text) :- !.
 835text_of(Text, en, Text) :- atom(Text), !.
 836text_of(Text, -, Text) :- integer(Text).
 837
 838
 839                 /*******************************
 840                 *         STEM INDEX           *
 841                 *******************************/
 842
 843%!  stem_index(-Map) is det.
 844%
 845%   Get the stemming literal index. This index is created on demand.
 846%   If some thread is creating the index, other threads wait for its
 847%   completion.
 848
 849stem_index(Map) :-
 850    literal_map(stem, Map),
 851    !,
 852    wait_for_map(stem).
 853stem_index(Map) :-
 854    rdf_new_literal_map(Map),
 855    assert(literal_map(stem, Map)),
 856    assert((new_token(Token, Lang) :- add_stem(Token, Lang, Map))),
 857    message_queue_create(Queue),
 858    assert(map_building(stem, Queue)),
 859    thread_create(fill_stem_index(Map, Queue), _,
 860                  [ alias('__rdf_stemmer'),
 861                    detached(true)
 862                  ]),
 863    wait_for_map(stem).
 864
 865wait_for_map(MapName) :-
 866    (   map_building(MapName, Queue)
 867    ->  catch(thread_get_message(Queue, _), _, true),
 868        wait_for_map(MapName)
 869    ;   true
 870    ).
 871
 872fill_stem_index(StemMap, Queue) :-
 873    call_cleanup(
 874        forall(rdf_current_literal(Literal),
 875               stem_literal_tokens(Literal, StemMap)),
 876        ( message_queue_destroy(Queue),
 877          retractall(map_building(stem, _)))).
 878
 879stem_literal_tokens(Literal, StemMap) :-
 880    rdf_tokenize_literal(Literal, Tokens),
 881    !,
 882    sort(Tokens, Tokens1),
 883    text_of(Literal, Lang, _Text),
 884    insert_tokens_stem(Tokens1, Lang, StemMap).
 885stem_literal_tokens(_,_).
 886
 887insert_tokens_stem([], _, _).
 888insert_tokens_stem([Token|T], Lang, Map) :-
 889    (   atom(Token)
 890    ->  (   stem(Token, Lang, Stem)
 891        ->  rdf_insert_literal_map(Map, Stem, Token, Keys),
 892            (   integer(Keys),
 893                Keys mod 1000 =:= 0
 894            ->  progress(Map, 'Stem')
 895            ;   true
 896            )
 897        ;   true
 898        )
 899    ;   true
 900    ),
 901    insert_tokens_stem(T, Lang, Map).
 902
 903
 904add_stem(Token, Lang, Map) :-
 905    stem(Lang, Token, Stem),
 906    rdf_insert_literal_map(Map, Stem, Token, _).
 907
 908:- if(current_predicate(snowball/3)).
 909stem(Token, LangSpec, Stem) :-
 910    main_lang(LangSpec, Lang),
 911    downcase_atom(Token, Lower),
 912    catch(snowball(Lang, Lower, Stem), _, fail).
 913:- else.
 914stem(Token, _Lang, Stem) :-
 915    downcase_atom(Token, Lower),
 916    porter_stem(Lower, Stem).
 917:- endif.
 918
 919main_lang(LangSpec, Lang) :-
 920    sub_atom(LangSpec, Before, _, _, -),
 921    !,
 922    sub_atom(LangSpec, 0, Before, _, Lang).
 923main_lang(LangSpec, Lang) :-
 924    downcase_atom(LangSpec, Lang).
 925
 926
 927                 /*******************************
 928                 *        METAPHONE INDEX       *
 929                 *******************************/
 930
 931
 932metaphone_index(Map) :-
 933    literal_map(metaphone, Map),
 934    !,
 935    wait_for_map(metaphone).
 936metaphone_index(Map) :-
 937    rdf_new_literal_map(Map),
 938    assert(literal_map(metaphone, Map)),
 939    assert((new_token(Token, Lang) :- add_metaphone(Token, Lang, Map))),
 940    message_queue_create(Queue),
 941    assert(map_building(metaphone, Queue)),
 942    thread_create(fill_metaphone_index(Map, Queue), _,
 943                  [ alias('__rdf_metaphone_indexer'),
 944                    detached(true)
 945                  ]),
 946    wait_for_map(metaphone).
 947
 948fill_metaphone_index(MetaphoneMap, Queue) :-
 949    call_cleanup(
 950        fill_metaphone_index(MetaphoneMap),
 951        ( message_queue_destroy(Queue),
 952          retractall(map_building(metaphone, _)))).
 953
 954fill_metaphone_index(MetaphoneMap) :-
 955    token_index(TokenMap),
 956    rdf_keys_in_literal_map(TokenMap, all, Tokens),
 957    metaphone(Tokens, MetaphoneMap).
 958
 959metaphone([], _).
 960metaphone([Token|T], Map) :-
 961    (   atom(Token),
 962        double_metaphone(Token, SoundEx)
 963    ->  rdf_insert_literal_map(Map, SoundEx, Token, Keys),
 964        (   integer(Keys),
 965            Keys mod 1000 =:= 0
 966        ->  progress(Map, 'Metaphone')
 967        ;   true
 968        )
 969    ;   true
 970    ),
 971    metaphone(T, Map).
 972
 973
 974add_metaphone(Token, _Lang, Map) :-
 975    atom(Token),
 976    !,
 977    double_metaphone(Token, SoundEx),
 978    rdf_insert_literal_map(Map, SoundEx, Token).
 979add_metaphone(_, _, _).
 980
 981%!  rdf_literal_index(+Type, -Index) is det.
 982%
 983%   True when Index is a literal map   containing the index of Type.
 984%   Type is one of:
 985%
 986%     - token
 987%     Tokens are basically words of literal values. See
 988%     rdf_tokenize_literal/2.  The `token` map maps tokens to full
 989%     literal texts.
 990%     - stem
 991%     Index of stemmed tokens.  If the language is available, the
 992%     tokens are stemmed using the matching _snowball_ stemmer.
 993%     The `stem` map maps stemmed to full tokens.
 994%     - metaphone
 995%     Phonetic index of tokens.  The `metaphone` map maps phonetic
 996%     keys to tokens.
 997
 998rdf_literal_index(token, Map) :-
 999    !,
1000    token_index(Map).
1001rdf_literal_index(stem, Map) :-
1002    !,
1003    stem_index(Map).
1004rdf_literal_index(metaphone, Map) :-
1005    !,
1006    metaphone_index(Map).
1007rdf_literal_index(Type, _Map) :-
1008    domain_error(literal_index, Type).
1009
1010
1011                 /*******************************
1012                 *             UTIL             *
1013                 *******************************/
1014
1015verbose(Fmt, Args) :-
1016    setting(verbose(true)),
1017    !,
1018    format(user_error, Fmt, Args).
1019verbose(_, _).
1020
1021progress(Map, Which) :-
1022    setting(verbose(true)),
1023    !,
1024    rdf_statistics_literal_map(Map, size(Keys, Values)),
1025    format(user_error,
1026           '\r~t~w: ~12|Keys: ~t~D~15+; Values: ~t~D~20+',
1027           [Which, Keys, Values]).
1028progress(_,_).