mj.ucw.cz Git - bex.git/commitdiff
bprun: Various cursed updates
author Martin Mares <mj@ucw.cz>
Mon, 31 Oct 2011 22:31:29 +0000 (23:31 +0100)
committer Martin Mares <mj@ucw.cz>
Mon, 31 Oct 2011 22:31:29 +0000 (23:31 +0100)
NOTES
bprun
brun

diff --git a/NOTES b/NOTES
index 7b40ffd95260661b6785893fe6f203428cdb1e0c..28adf1971c959a42100a60e9bea75eeb25644a10 100644 (file)
--- a/NOTES
+++ b/NOTES
@@ -67,7 +67,7 @@ REQUEUE               Attempted to put on a queue, but it already was there
 Additional status codes sent only over status FIFO:
 
 DONE           Done with the host (job equals "-")
-INIT           Host ready, preparing to execute jobs (job equals "-")
+INIT           Host or job ready, preparing to execute jobs
 LOCKED         Host or job not available, because it is locked by another brun
 PING           Trying to ping the host (job equals "-")
 SEND           Sending job to the host
diff --git a/bprun b/bprun
index 966111aec441352958827b0d2874f31bc4a11949..0a9253350cc07a9ecc21fda3a6241a6726f8ffa7 100755 (executable)
--- a/bprun
+++ b/bprun
@@ -4,6 +4,8 @@
 
 use strict;
 use warnings;
+use feature 'switch';
+
 use Getopt::Long;
 use POSIX;
 
@@ -31,8 +33,15 @@ AMEN
 system 'screen', '-S', $screen_session, '-X', 'select', '.';
 !$? or die "Screen session $screen_session not found\n";
 
-my @machines = BEX::Config::parse_machine_list(@ARGV ? @ARGV : '*');
 my $queue = BEX::Queue->new($queue_name);
+my @machines = ();
+my %job_counter = ();
+for my $m (BEX::Config::parse_machine_list(@ARGV ? @ARGV : '*')) {
+       my @jobs = $queue->scan($m);
+       @jobs or next;
+       push @machines, $m;
+       $job_counter{$m} = @jobs;
+}
 
 my $fifo_name = $queue->{'Name'} . '/status-fifo';
 unlink $fifo_name;
@@ -47,8 +56,6 @@ my $max = $BEX::Config::max_parallel_jobs;
 while (keys %running || @machines) {
        if (@machines && keys %running < $max) {
                my $mach = shift @machines;
-               my @jobs = $queue->scan($mach);
-               @jobs or next;
                $ui->update($mach, undef, 'START');
                my @scr = ('screen', '-t', $mach);
                push @scr, '-S', $screen_session if defined $screen_session;
@@ -109,6 +116,22 @@ my $nrows;
 my @by_row = ();
 my %by_host = ();
 
+my $total_hosts;
+my $active_hosts;
+my $done_hosts;
+my $failed_hosts;
+
+my $total_jobs;
+my $active_jobs;
+my $done_jobs;
+my $failed_jobs;
+
+my %host_active_jobs;
+my %host_done_jobs;
+my %host_failed_jobs;
+my %host_last_fail_job;
+my %host_last_fail_stat;
+
 sub new($) {
        $C = new Curses;
        start_color;
@@ -119,18 +142,55 @@ sub new($) {
        $C->meta(1);
        $C->clear;
        init_pair(1, COLOR_YELLOW, COLOR_BLUE);
+       init_pair(2, COLOR_YELLOW, COLOR_RED);
+       init_pair(3, COLOR_YELLOW, COLOR_BLACK);
+       init_pair(4, COLOR_RED, COLOR_BLACK);
+
        $nrows = $C->getmaxy - 2;
        if ($BEX::Config::max_parallel_jobs > $nrows) {
                $BEX::Config::max_parallel_jobs = $nrows;
        }
-       return bless {};
+
+       $total_hosts = $active_hosts = $done_hosts = $failed_hosts = 0;
+       $total_jobs = $active_jobs = $done_jobs = $failed_jobs = 0;
+       %host_active_jobs = %host_done_jobs = %host_failed_jobs = %host_last_fail_job = %host_last_fail_stat = ();
+       for my $m (@machines) {
+               $total_hosts++;
+               $total_jobs += $job_counter{$m};
+               $host_active_jobs{$m} = $host_done_jobs{$m} = $host_failed_jobs{$m} = 0;
+       }
+
+       my $ui = bless {};
+       $ui->refresh_status;
+       return $ui;
 }
 
 sub done($)
 {
+       $C->bkgdset(COLOR_PAIR(1) | A_BOLD);
+       $C->addstr($C->getmaxy-1, 0, "Press any key to quit...");
+       $C->clrtoeol;
+       $C->getch;
        endwin;
 }
 
+sub err($$) {
+       my ($ui, $msg) = @_;
+       $C->bkgdset(COLOR_PAIR(2) | A_BOLD);
+       $C->addnstr($C->getmaxy-1, 0, "ERROR: $msg", $C->getmaxx);
+       $C->clrtoeol;
+       $C->refresh;
+}
+
+sub refresh_status($) {
+       $C->bkgdset(COLOR_PAIR(1) | A_BOLD);
+       my $waiting_hosts = $total_hosts - $active_hosts - $done_hosts - $failed_hosts;
+       my $waiting_jobs = $total_jobs - $active_jobs - $done_jobs - $failed_jobs;
+       $C->addnstr(0, 0, "BEX  Hosts: ${active_hosts}R ${done_hosts}D ${failed_hosts}E ${waiting_hosts}W  Jobs: ${active_jobs}R ${done_jobs}D ${failed_jobs}E ${waiting_jobs}W", $C->getmaxx);
+       $C->clrtoeol;
+       $C->refresh;
+}
+
 sub get_slot($) {
        my ($mach) = @_;
        my $s;
@@ -172,13 +232,39 @@ sub redraw_slot($) {
        my $stat = $s->{'Status'} // "?";
        my $jid = $s->{'Job'} // "";
        my $jname = ($jid eq "" ? "" : $queue->job_name($jid));
-       my $text = sprintf("%-20s %-10s %s", $mach, $stat, $jname);
-       if ($stat eq 'DONE') {
+       if ($host_active_jobs{$mach}) {
+               if ($host_failed_jobs{$mach}) {
+                       $C->bkgdset(COLOR_PAIR(4) | A_BOLD);
+               } else {
+                       $C->bkgdset(COLOR_PAIR(3) | A_BOLD);
+               }
+       } else {
+               if ($host_failed_jobs{$mach}) {
+                       $C->bkgdset(COLOR_PAIR(4));
+               } else {
+                       $C->bkgdset(0);
+               }
+       }
+       my $r = $s->{'Row'} + 1;
+       $C->addstr($r, 0, sprintf("%-20.20s", $mach));
+       if ($host_failed_jobs{$mach}) {
+               $C->bkgdset(COLOR_PAIR(4));
+               $C->addstr(sprintf("%3dE ", $host_failed_jobs{$mach}));
+       } else {
                $C->bkgdset(0);
+               $C->addstr("     ");
+       }
+       $C->bkgdset(0);
+       $C->addstr(sprintf("%3dD %3dW", $host_done_jobs{$mach}, $job_counter{$mach} - $host_done_jobs{$mach} - $host_failed_jobs{$mach}));
+       if ($stat eq 'DONE') {
+               if (defined $host_last_fail_stat{$mach}) {
+                       $C->bkgdset(COLOR_PAIR(4));
+                       $C->addstr(sprintf("  %-8s %s", $host_last_fail_stat{$mach}, $host_last_fail_job{$mach}));
+               }
        } else {
-               $C->bkgdset(COLOR_PAIR(1) | A_BOLD);
+               my $text = sprintf("  %-8s %s", $stat, $jname);
+               $C->addstr($text);
        }
-       $C->addnstr($s->{'Row'}, 0, $text, $C->getmaxx);
        $C->clrtoeol;
        $C->refresh;
 }
@@ -186,8 +272,59 @@ sub redraw_slot($) {
 sub update($$$$) {
        my ($ui, $mach, $jid, $stat) = @_;
        my $s = get_slot($mach);
+       given ($stat) {
+               when ('OK') {
+                       $active_jobs--;
+                       $done_jobs++;
+                       $host_active_jobs{$mach}--;
+                       $host_done_jobs{$mach}++;
+               }
+               when (['FAILED', 'INTERR', 'NOPING', 'PREPFAIL']) {
+                       $active_jobs--;
+                       $failed_jobs++;
+                       $host_active_jobs{$mach}--;
+                       $host_failed_jobs{$mach}++;
+                       $host_last_fail_job{$mach} = $jid;
+                       $host_last_fail_stat{$mach} = $stat;
+               }
+               when ('DONE') {
+                       $active_hosts--;
+                       if ($host_failed_jobs{$mach}) {
+                               $failed_hosts++;
+                       } else {
+                               $done_hosts++;
+                       }
+               }
+               when ('INIT') {
+                       if (defined $jid) {
+                               $active_hosts++;
+                       } else {
+                               $active_jobs++;
+                               $host_active_jobs{$mach}++;
+                       }
+               }
+               when ('LOCKED') {
+                       if ($jid eq '-') {
+                               $failed_jobs += $job_counter{$mach};
+                               $host_failed_jobs{$mach} += $job_counter{$mach};
+                       } else {
+                               $active_jobs--;
+                               $failed_jobs++;
+                               $host_active_jobs{$mach}--;
+                               $host_failed_jobs{$mach}++;
+                               $host_last_fail_job{$mach} = $jid;
+                               $host_last_fail_stat{$mach} = $stat;
+                       }
+               }
+               when (['START', 'PING', 'SEND', 'RUN']) {
+               }
+               default {
+                       $ui->err("Received unknown job status $stat");
+               }
+       }
        $s->{'Job'} = $jid;
        $s->{'Status'} = $stat;
        redraw_slot($s);
        if ($stat eq 'DONE') { delete_slot($s); }
+       $ui->refresh_status;
 }
diff --git a/brun b/brun
index 3769b31eaf886d9a01cbd58e524c4c35eb897f54..040b6ba40d66d6abc9f83bd8b00401062a76e069 100755 (executable)
--- a/brun
+++ b/brun
@@ -167,6 +167,7 @@ for my $mach (@machines) {
                        $jid eq $given_job or next;
                }
                my $job = BEX::Job->new_from_file($queue->job_file($jid));
+               update_status($mach, $jid, 'INIT', undef);
                if (!$queue->lock($mach, $jid)) {
                        print "### Skipping locked $jid on $mach ###\n";
                        update_status($mach, $jid, 'LOCKED', undef);