lib/perl/Query.pm

   1 #       Perl module for sending queries to Sherlock search servers and parsing answers
   2 #
   3 #       (c) 2002--2003 Martin Mares <mj@ucw.cz>
   4 #
   5 #       This software may be freely distributed and used according to the terms
   6 #       of the GNU Lesser General Public License.
   7
   8 =head1 NAME
   9
  10 Sherlock::Query -- Communication with Sherlock Search Servers
  11
  12 =head1 DESCRIPTION
  13
  14 This perl library offers a simple interface for connecting to Sherlock
  15 search servers, sending queries or control commands and parsing the
  16 results.
  17
  18 First of all, you have to use
  19
  20         my $conn = new Sherlock::Query('server:port');
  21
  22 to create a new connection object (unconnected yet). Then you can call
  23
  24         my $res = $conn->command('command');
  25
  26 to establish the connection, send a given command to the search server
  27 and gather the results (see below) or, if you want to send a normal query,
  28
  29         my $res = $conn->query('"simple" OR "query"');
  30
  31 which does the same as C<< $conn->command(...) >>, but it also parses the
  32 results to a representation convenient for handling in Perl programs
  33 (again, see below).
  34
  35 Currently, you can use a single connection to send only a single command or query.
  36
  37 =head1 RESULTS
  38
  39 The I<raw answer> of the search server (i.e., the lines it has returned) is always
  40 available as C<< $conn->{RAW} >> as a list of strings, each representing a single
  41 line.
  42
  43 Parsed results of queries are stored in a more complicated way, but before
  44 explaining it, let's mention a couple of axioms: Any search server I<object>
  45 (header, footer, a single document of answer) is always stored as a hash keyed
  46 by attribute names. Ordinary single-valued attributes are stored as strings,
  47 multi-valued attributes as (references to) arrays of strings. When an object
  48 contains sub-objects, they are stored as references to other hashes, possibly
  49 encapsulated within a list if there can be multiple such objects. Most objects
  50 have an extra attribute C<RAW> containing the original description of the
  51 object, a sub-list of C<< $conn->{RAW} >>.
  52
  53 The parsed answer consists of three parts (please follow F<doc/search> to
  54 get a better picture of what does the server answer): header C<< $conn->{HEADER} >>
  55 (an object, as described above), footer C<< $conn->{FOOTER} >> (object) and document
  56 cards C<< $conn->{CARDS} >> (a list of objects).
  57
  58 The I<header> contains all the standard header attributes and also C<< $hdr->{D} >>
  59 which is a list of sub-objects, each corresponding to a single database and
  60 containing per-database attributes like C<W> (word list).
  61
  62 The I<footer> is pretty straightforward and it just contains what you'd
  63 expect it to.
  64
  65 Each I<card> contains the usual document attributes (see F<doc/objects> for
  66 a list) plus C<< $card->{U} >> which is a list of sub-objects corresponding
  67 to URL's of the document and containing per-URL attributes like C<U> (URL),
  68 C<s> (original size) and C<T> (content type).
  69
  70 When in doubt, call the C<print> method which will print the whole contents
  71 of the connection object. It's actually a much more general (but pretty
  72 simple due to Perl being able to be a very introspective language) routine
  73 usable for dumping any acyclic Perl data structure composed of strings,
  74 hashes, arrays and references to them. You can access this general routine
  75 by calling C<format({ print; }, $what)> which dumps C<$what> and for
  76 each line of output it calls the given subroutine.
  77
  78 =head1 SEE ALSO
  79
  80 A good example of use of this module is the C<query> utility and
  81 of course the example front-end (F<front-end/query.cgi>).
  82
  83 =head1 AUTHOR
  84
  85 Martin Mares <mj@ucw.cz>
  86
  87 =cut
  88
  89 package Sherlock::Query;
  90
  91 use strict;
  92 use warnings;
  93 use IO::Socket::INET;
  94
  95 sub parse_tree($$);
  96 sub do_parse_tree($$$$);
  97 sub format_tree($$$);
  98
  99 sub new($$) {
 100         my $class = shift @_;
 101         my $server = shift @_;
 102         my $self = {
 103                 SERVER  => $server
 104         };
 105         bless $self;
 106         return $self;
 107 }
 108
 109 sub command($$) {
 110         my ($q,$string) = @_;
 111
 112         $q->{RAW} = [];
 113
 114         my $sock = IO::Socket::INET->new(PeerAddr => $q->{SERVER}, Proto => 'tcp')
 115                 or return "-900 Cannot connect to search server: $!";
 116         print $sock $string, "\n";
 117
 118         # Status line
 119         my $stat = <$sock>;
 120         $stat = "-903 Incomplete reply" if !defined $stat;
 121         chomp $stat;
 122         $stat =~ /^[+-]/ or return "-901 Reply parse error";
 123
 124         # Blocks of output
 125         my $block = undef;
 126         for(;;) {
 127                 my $res = <$sock>;
 128                 last if $sock->eof;
 129                 chomp $res;
 130                 if ($res eq "") {
 131                         $block = undef;
 132                 } else {
 133                         if (!defined $block) {
 134                                 $block = [];
 135                                 push @{$q->{RAW}}, $block;
 136                         }
 137                         push @$block, $res;
 138                 }
 139         }
 140
 141         return $stat;
 142 }
 143
 144 our $hdr_syntax = {
 145         '(D' => {
 146                 'W' => [],
 147                 'P' => [],
 148                 'n' => [],
 149                 '.' => [],
 150         },
 151         '.' => [],
 152 };
 153
 154 our $card_syntax = {
 155         '(U' => {
 156                 'V' => [],
 157                 'y' => [],
 158                 'E' => [],
 159         },
 160         'M' => [],
 161         'X' => [],
 162 };
 163
 164 our $footer_syntax = {
 165 };
 166
 167 sub query($$) {
 168         my ($q,$string) = @_;
 169
 170         # Send the query and gather results
 171         my $stat = $q->command($string);
 172         my @raw = @{$q->{RAW}};
 173
 174         # Split results to header, cards and footer
 175         $q->{HEADER} = { RAW => [] };
 176         if (@raw) { $q->{HEADER}{RAW} = shift @raw; }
 177         elsif (!$stat) { return "-902 Incomplete reply"; }
 178         $q->{FOOTER} = { RAW => [] };
 179         if (@raw && $raw[@raw-1]->[0] =~ /^\+/) {
 180                 $q->{FOOTER}{RAW} = pop @raw;
 181         }
 182         $q->{CARDS} = [];
 183         while (@raw) {
 184                 push @{$q->{CARDS}}, { RAW => shift @raw };
 185         }
 186
 187         # Parse everything
 188         parse_tree($q->{HEADER}, $hdr_syntax);
 189         foreach my $c (@{$q->{CARDS}}) {
 190                 parse_tree($c, $card_syntax);
 191         }
 192         parse_tree($q->{FOOTER}, $footer_syntax);
 193
 194         return $stat;
 195 }
 196
 197 sub parse_tree($$) {
 198         my $tree = shift @_;
 199         my $syntax = shift @_;
 200         do_parse_tree($tree->{RAW}, 0, $tree, $syntax);
 201 }
 202
 203 sub do_parse_tree($$$$) {
 204         my $raw = shift @_;
 205         my $i = shift @_;
 206         my $cooked = shift @_;
 207         my $syntax = shift @_;
 208
 209         while ($i < @$raw) {
 210                 $raw->[$i] =~ /^([^(]|\(.)(.*)/;
 211                 if ($1 eq ")") {
 212                         return $i;
 213                 } elsif (!defined($syntax->{$1})) {
 214                         $cooked->{$1} = $2 if !defined($cooked->{$1});
 215                         $i++;
 216                 } elsif (ref $syntax->{$1} eq "ARRAY") {
 217                         push @{$cooked->{$1}}, $2;
 218                         $i++;
 219                 } elsif (ref $syntax->{$1} eq "HASH") {
 220                         my $block = {};
 221                         push @{$cooked->{$1}}, $block;
 222                         $i = do_parse_tree($raw, $i+1, $block, $syntax->{$1});
 223                 }
 224         }
 225         return $i;
 226 }
 227
 228 sub format_tree($$$) {
 229         my ($func, $a, $indent) = @_;
 230         if (ref $a eq "ARRAY") {
 231                 if (@{$a} == 0) { &$func("[]\n"); }
 232                 else {
 233                         &$func("[\n");
 234                         foreach my $k (@{$a}) {
 235                                 &$func("$indent\t");
 236                                 format_tree($func, $k, "$indent\t");
 237                         }
 238                         &$func($indent . "]\n");
 239                 }
 240         } elsif (ref $a) {
 241                 &$func("{\n");
 242                 foreach my $k (sort keys %{$a}) {
 243                         &$func("$indent\t$k => ");
 244                         format_tree($func, $a->{$k}, "$indent\t");
 245                 }
 246                 &$func($indent . "}\n");
 247         } elsif (defined $a) {
 248                 &$func("$a\n");
 249         } else {
 250                 &$func("UNDEF\n");
 251         }
 252 }
 253
 254 sub format($&$) {
 255         my ($q, $func, $what) = @_;
 256         format_tree($func, $what, "");
 257 }
 258
 259 sub print($) {
 260         my $q = shift @_;
 261         format_tree(sub { print $_[0]; }, $q, "");
 262 }
 263
 264 1;  # OK