From: Martin Mares Date: Mon, 31 Oct 2011 22:31:29 +0000 (+0100) Subject: bprun: Various cursed updates X-Git-Tag: v3.0~28 X-Git-Url: http://mj.ucw.cz/gitweb/?a=commitdiff_plain;h=7f1ddb0d45fa5fcd2d71107c3a28b5a8c3bcae10;p=bex.git bprun: Various cursed updates --- diff --git a/NOTES b/NOTES index 7b40ffd..28adf19 100644 --- a/NOTES +++ b/NOTES @@ -67,7 +67,7 @@ REQUEUE Attempted to put on a queue, but it already was there Additional status codes sent only over status FIFO: DONE Done with the host (job equals "-") -INIT Host ready, preparing to execute jobs (job equals "-") +INIT Host or job ready, preparing to execute jobs LOCKED Host or job not available, because it is locked by another brun PING Trying to ping the host (job equals "-") SEND Sending job to the host diff --git a/bprun b/bprun index 966111a..0a92533 100755 --- a/bprun +++ b/bprun @@ -4,6 +4,8 @@ use strict; use warnings; +use feature 'switch'; + use Getopt::Long; use POSIX; @@ -31,8 +33,15 @@ AMEN system 'screen', '-S', $screen_session, '-X', 'select', '.'; !$? or die "Screen session $screen_session not found\n"; -my @machines = BEX::Config::parse_machine_list(@ARGV ? @ARGV : '*'); my $queue = BEX::Queue->new($queue_name); +my @machines = (); +my %job_counter = (); +for my $m (BEX::Config::parse_machine_list(@ARGV ? @ARGV : '*')) { + my @jobs = $queue->scan($m); + @jobs or next; + push @machines, $m; + $job_counter{$m} = @jobs; +} my $fifo_name = $queue->{'Name'} . '/status-fifo'; unlink $fifo_name; @@ -47,8 +56,6 @@ my $max = $BEX::Config::max_parallel_jobs; while (keys %running || @machines) { if (@machines && keys %running < $max) { my $mach = shift @machines; - my @jobs = $queue->scan($mach); - @jobs or next; $ui->update($mach, undef, 'START'); my @scr = ('screen', '-t', $mach); push @scr, '-S', $screen_session if defined $screen_session; @@ -109,6 +116,22 @@ my $nrows; my @by_row = (); my %by_host = (); +my $total_hosts; +my $active_hosts; +my $done_hosts; +my $failed_hosts; + +my $total_jobs; +my $active_jobs; +my $done_jobs; +my $failed_jobs; + +my %host_active_jobs; +my %host_done_jobs; +my %host_failed_jobs; +my %host_last_fail_job; +my %host_last_fail_stat; + sub new($) { $C = new Curses; start_color; @@ -119,18 +142,55 @@ sub new($) { $C->meta(1); $C->clear; init_pair(1, COLOR_YELLOW, COLOR_BLUE); + init_pair(2, COLOR_YELLOW, COLOR_RED); + init_pair(3, COLOR_YELLOW, COLOR_BLACK); + init_pair(4, COLOR_RED, COLOR_BLACK); + $nrows = $C->getmaxy - 2; if ($BEX::Config::max_parallel_jobs > $nrows) { $BEX::Config::max_parallel_jobs = $nrows; } - return bless {}; + + $total_hosts = $active_hosts = $done_hosts = $failed_hosts = 0; + $total_jobs = $active_jobs = $done_jobs = $failed_jobs = 0; + %host_active_jobs = %host_done_jobs = %host_failed_jobs = %host_last_fail_job = %host_last_fail_stat = (); + for my $m (@machines) { + $total_hosts++; + $total_jobs += $job_counter{$m}; + $host_active_jobs{$m} = $host_done_jobs{$m} = $host_failed_jobs{$m} = 0; + } + + my $ui = bless {}; + $ui->refresh_status; + return $ui; } sub done($) { + $C->bkgdset(COLOR_PAIR(1) | A_BOLD); + $C->addstr($C->getmaxy-1, 0, "Press any key to quit..."); + $C->clrtoeol; + $C->getch; endwin; } +sub err($$) { + my ($ui, $msg) = @_; + $C->bkgdset(COLOR_PAIR(2) | A_BOLD); + $C->addnstr($C->getmaxy-1, 0, "ERROR: $msg", $C->getmaxx); + $C->clrtoeol; + $C->refresh; +} + +sub refresh_status($) { + $C->bkgdset(COLOR_PAIR(1) | A_BOLD); + my $waiting_hosts = $total_hosts - $active_hosts - $done_hosts - $failed_hosts; + my $waiting_jobs = $total_jobs - $active_jobs - $done_jobs - $failed_jobs; + $C->addnstr(0, 0, "BEX Hosts: ${active_hosts}R ${done_hosts}D ${failed_hosts}E ${waiting_hosts}W Jobs: ${active_jobs}R ${done_jobs}D ${failed_jobs}E ${waiting_jobs}W", $C->getmaxx); + $C->clrtoeol; + $C->refresh; +} + sub get_slot($) { my ($mach) = @_; my $s; @@ -172,13 +232,39 @@ sub redraw_slot($) { my $stat = $s->{'Status'} // "?"; my $jid = $s->{'Job'} // ""; my $jname = ($jid eq "" ? "" : $queue->job_name($jid)); - my $text = sprintf("%-20s %-10s %s", $mach, $stat, $jname); - if ($stat eq 'DONE') { + if ($host_active_jobs{$mach}) { + if ($host_failed_jobs{$mach}) { + $C->bkgdset(COLOR_PAIR(4) | A_BOLD); + } else { + $C->bkgdset(COLOR_PAIR(3) | A_BOLD); + } + } else { + if ($host_failed_jobs{$mach}) { + $C->bkgdset(COLOR_PAIR(4)); + } else { + $C->bkgdset(0); + } + } + my $r = $s->{'Row'} + 1; + $C->addstr($r, 0, sprintf("%-20.20s", $mach)); + if ($host_failed_jobs{$mach}) { + $C->bkgdset(COLOR_PAIR(4)); + $C->addstr(sprintf("%3dE ", $host_failed_jobs{$mach})); + } else { $C->bkgdset(0); + $C->addstr(" "); + } + $C->bkgdset(0); + $C->addstr(sprintf("%3dD %3dW", $host_done_jobs{$mach}, $job_counter{$mach} - $host_done_jobs{$mach} - $host_failed_jobs{$mach})); + if ($stat eq 'DONE') { + if (defined $host_last_fail_stat{$mach}) { + $C->bkgdset(COLOR_PAIR(4)); + $C->addstr(sprintf(" %-8s %s", $host_last_fail_stat{$mach}, $host_last_fail_job{$mach})); + } } else { - $C->bkgdset(COLOR_PAIR(1) | A_BOLD); + my $text = sprintf(" %-8s %s", $stat, $jname); + $C->addstr($text); } - $C->addnstr($s->{'Row'}, 0, $text, $C->getmaxx); $C->clrtoeol; $C->refresh; } @@ -186,8 +272,59 @@ sub redraw_slot($) { sub update($$$$) { my ($ui, $mach, $jid, $stat) = @_; my $s = get_slot($mach); + given ($stat) { + when ('OK') { + $active_jobs--; + $done_jobs++; + $host_active_jobs{$mach}--; + $host_done_jobs{$mach}++; + } + when (['FAILED', 'INTERR', 'NOPING', 'PREPFAIL']) { + $active_jobs--; + $failed_jobs++; + $host_active_jobs{$mach}--; + $host_failed_jobs{$mach}++; + $host_last_fail_job{$mach} = $jid; + $host_last_fail_stat{$mach} = $stat; + } + when ('DONE') { + $active_hosts--; + if ($host_failed_jobs{$mach}) { + $failed_hosts++; + } else { + $done_hosts++; + } + } + when ('INIT') { + if (defined $jid) { + $active_hosts++; + } else { + $active_jobs++; + $host_active_jobs{$mach}++; + } + } + when ('LOCKED') { + if ($jid eq '-') { + $failed_jobs += $job_counter{$mach}; + $host_failed_jobs{$mach} += $job_counter{$mach}; + } else { + $active_jobs--; + $failed_jobs++; + $host_active_jobs{$mach}--; + $host_failed_jobs{$mach}++; + $host_last_fail_job{$mach} = $jid; + $host_last_fail_stat{$mach} = $stat; + } + } + when (['START', 'PING', 'SEND', 'RUN']) { + } + default { + $ui->err("Received unknown job status $stat"); + } + } $s->{'Job'} = $jid; $s->{'Status'} = $stat; redraw_slot($s); if ($stat eq 'DONE') { delete_slot($s); } + $ui->refresh_status; } diff --git a/brun b/brun index 3769b31..040b6ba 100755 --- a/brun +++ b/brun @@ -167,6 +167,7 @@ for my $mach (@machines) { $jid eq $given_job or next; } my $job = BEX::Job->new_from_file($queue->job_file($jid)); + update_status($mach, $jid, 'INIT', undef); if (!$queue->lock($mach, $jid)) { print "### Skipping locked $jid on $mach ###\n"; update_status($mach, $jid, 'LOCKED', undef);