#!/usr/freeware/bin/perl
#
# Keeps a pool of background jobs sized to the time of day and to the
# machine's current load average.
#
# Every 15 seconds the main loop:
#   1. pauses everything while the flag file /tmp/no_seti exists,
#   2. kills jobs while more are running than the schedule and the 1-minute
#      load allow,
#   3. reaps jobs that exited on their own and recycles their slot ids,
#   4. starts at most one new job if both the 1- and 15-minute loads leave
#      room for it.
#
# Each job is identified by a slot id (0 .. number_of_cpus - 1).  Slot N is
# run from the directory "<first two letters of hostname><NN>" via
# "npri -w seti<NN> -email".
#
# Sending SIGINT to this script interrupts all running jobs and exits.

use strict;
use warnings;

# WNOHANG used to come from the h2ph-generated "sys/wait.ph"; the core POSIX
# module provides it (and EAGAIN) without needing generated headers.
use POSIX qw(WNOHANG EAGAIN);

select(STDOUT);
$| = 1;    # unbuffer output

# ---------------------------------------------------------------------------
# Shared state (read and written by the subs below)
# ---------------------------------------------------------------------------
my $COOL_FILE = "/tmp/no_seti";    # while this file exists, nothing runs
my %pids;                          # pid => slot id of every running job
my $uptime    = '';                # last raw line read from uptime(1)
my @last_load = (0, 0, 0);         # last successfully parsed, scaled loads

my $P    = how_many_cpus();
my $mach = get_machine_name();
print "Running with " . $P . " processors on " . $mach . "\n";

my @ready = (0 .. $P - 1);         # slot ids not currently running

# ---------------------------------------------------------------------------
# Schedule: how many jobs may run, per hour of the day
# ---------------------------------------------------------------------------
# One array per "tier", indexed by hour, so the schedules below can be
# written as hour-range slices.  Not every tier is used by the active
# schedule; the others are kept available for editing it.
my (@zero, @one, @small, @medium, @large, @xlarge);
for my $h (0 .. 23) {
    $zero[$h]   = 0;
    $one[$h]    = 1;
    $small[$h]  = int($P / 4);
    $medium[$h] = int($P / 2);
    $large[$h]  = 3 * int($P / 4);
    $xlarge[$h] = $P;
}

my @weekday = (@xlarge[0 .. 7], $large[8], @small[9 .. 16], @small[17 .. 18],
               @large[19 .. 23]);
my @weekend = (@xlarge[0 .. 10], @large[11 .. 23]);

# Holiday rates
# @weekday = (@xlarge[0..8], @large[9..23]);
# @weekend = (@xlarge[0..11], @large[12..22], @xlarge[23..23]);

print "Weekday " . join(' ', @weekday) . "\n";
print "Weekend " . join(' ', @weekend) . "\n";

my %day = (
    'Sun' => [@weekend],
    'Mon' => [@weekday],
    'Tue' => [@weekday],
    'Wed' => [@weekday],
    'Thu' => [@weekday],
    'Fri' => [@weekday],
    'Sat' => [@weekend],
);

$SIG{'INT'} = \&CLEANUP;

# ---------------------------------------------------------------------------
# Main loop
# ---------------------------------------------------------------------------
while (1) {
    my $did_something = 0;

    should_we_cool_it();

    my ($min1, $min5, $min15) = get_la();
    my ($dayname, $hour)      = get_day_and_hour();
    my $ncpus                 = $day{$dayname}[$hour];

    # if there is too large a load average in the last minute,
    # start killing jobs rapidly, but don't die on NFS service (+0.15 or less)
    my $allowed = int($ncpus - $min1 + 0.15);
    print scalar(keys(%pids)) . " > " . $allowed . "\n";
    while (%pids && scalar(keys(%pids)) > $allowed) {
        print $uptime . "\n";

        # Any running job will do; take the first one the hash yields.
        my ($pid) = keys(%pids);
        my $id = delete $pids{$pid};
        print "Killing " . $id . " (" . $pid . ")\n";

        kill 'INT', $pid;
        sleep(2);              # give the job a moment to shut down
        waitpid($pid, 0);
        push(@ready, $id);
        $did_something = 1;
    }

    # see if there are any zombies
    while ((my $pid = waitpid(-1, WNOHANG)) > 0) {
        my $id = delete $pids{$pid};
        print $pid . " (" . (defined $id ? $id : '') . "): Found a zombie\n";

        # Only recycle the slot if the child really was one of our jobs;
        # pushing undef here would later start a job with no slot id.
        push(@ready, $id) if defined $id;
        $did_something = 1;
    }

    # if the machine has been a bit idle for the last 15 minutes,
    # start up some more tasks (one per pass, and only if a slot is free)
    my $running = scalar(keys(%pids));
    if (   @ready
        && $running < int($ncpus - $min15)
        && $running < int($ncpus - $min1))
    {
        print $uptime . "\n";
        startup(shift @ready);
        $did_something = 1;
    }

    if ($did_something) {
        print "Ideal: " . $ncpus;
        print " Running " . scalar(keys(%pids)) . ": "
            . join(' ', values(%pids)) . "\n";
    }

    sleep(15);
}

# startup($id)
#
# Forks a child for slot $id and records its pid in %pids.  The child changes
# into the slot's directory, closes its standard handles and execs
# "npri -w seti<NN> -email".  If fork fails with EAGAIN it is retried every
# 5 seconds; any other fork failure is fatal.
sub startup {
    my ($id) = @_;
    print "Starting " . $id . "\n";

    FORK: {
        my $pid = fork;
        if ($pid) {
            # Parent: remember which slot this pid occupies.
            $pids{$pid} = $id;
        }
        elsif (defined $pid) {
            # Child.  Drop the parent's INT handler so an early SIGINT does
            # not make the child run CLEANUP against its sibling jobs.
            $SIG{'INT'} = 'DEFAULT';

            my $dir     = sprintf('%02d', $id);    # slots 0-9 are zero-padded
            my @command = ('npri', '-w', 'seti' . $dir, '-email');

            # As before, the job is still started if the directory is
            # missing; the failure is just no longer silent.
            chdir($mach . $dir)
                or warn "Can't chdir to " . $mach . $dir . ": $!\n";

            print join(' ', @command) . "\n";
            close(STDIN);
            close(STDOUT);
            close(STDERR);

            # exec only returns on failure.
            exec(@command) or exit 1;
        }
        elsif ($! == EAGAIN) {
            # Temporarily out of processes: wait and try again.
            sleep 5;
            redo FORK;
        }
        else {
            die "Can't fork: $!\n";
        }
    }
    return;
}

# CLEANUP()
#
# SIGINT handler: interrupts every running job, announces the exit and
# terminates the script.
sub CLEANUP {
    kill('INT', keys(%pids)) if %pids;
    print $$ . " Exiting\n";
    sleep 2;
    exit 0;
}

# get_machine_name()
#
# Returns the first two characters of the node name reported by "uname -n".
# Dies if uname cannot be run or prints nothing.
sub get_machine_name {
    open(my $uname, '-|', 'uname', '-n') or die "Can't run uname\n";
    my $name = <$uname>;
    close($uname);

    die "Can't run uname\n" unless defined $name;
    chomp $name;
    die "Can't run uname\n" if $name eq '';

    return substr($name, 0, 2);
}

# how_many_cpus()
#
# Returns the first whitespace-separated field of the first line printed by
# hinv, which is taken to be the processor count.  Falls back to 1 when hinv
# cannot be run or that field is not a positive integer.
sub how_many_cpus {
    open(my $hinv, '-|', 'hinv | head -1') or return 1;
    my $line = <$hinv>;
    close($hinv);
    return 1 unless defined $line;

    # split() no longer fills @_ in void context (removed in Perl 5.12), so
    # capture the first field explicitly.
    my ($count) = split(' ', $line);
    return 1 unless defined $count && $count =~ /^\d+$/ && $count > 0;
    return $count;
}

# get_day_and_hour()
#
# Returns (abbreviated English day name, hour 0-23) for the current local
# time.  The day name is one of the keys of %day.
sub get_day_and_hour {
    my @day_names = qw(Sun Mon Tue Wed Thu Fri Sat);
    my ($hour, $wday) = (localtime)[2, 6];
    return ($day_names[$wday], $hour);
}

# get_la()
#
# Runs uptime, stores its output line in $uptime (printed by the main loop)
# and returns the 1-, 5- and 15-minute load averages, each multiplied by
# $P / 2.  That scaling is inherited from the original script; its rationale
# is not documented here.
#
# If uptime cannot be run or its output cannot be parsed, the previously
# returned values are reused (initially 0, 0, 0) instead of garbage.
sub get_la {
    my $line;
    if (open(my $up, '-|', 'uptime')) {
        $line = <$up>;
        close($up);
    }

    if (defined $line) {
        chomp $line;
        $uptime = $line;
        if ($line =~ /load averages?:\s*([\d.]+),?\s+([\d.]+),?\s+([\d.]+)/) {
            @last_load = map { $P / 2 * $_ } ($1, $2, $3);
            return @last_load;
        }
    }

    warn "Can't read load average from uptime; reusing previous values\n";
    return @last_load;
}

# should_we_cool_it()
#
# While the flag file $COOL_FILE exists: interrupts all running jobs, waits
# for each to exit, returns its slot id to @ready, then sleeps 60 seconds
# and checks again.  Returns once the file is gone.
sub should_we_cool_it {
    while (-e $COOL_FILE) {
        print "COOLed off\n";
        kill('INT', keys(%pids)) if %pids;

        for my $pid (keys(%pids)) {
            waitpid($pid, 0);
            print "Killed " . $pid . "\n";

            # Give the slot back, otherwise the job pool shrinks for good
            # after every cool-off.
            push(@ready, delete $pids{$pid});
        }

        sleep(60);
    }
    return;
}