#!/usr/bin/perl -w
#
# pgwatchlocks:
# Watch PostgreSQL's activity tables to detect locks which are held for a long
# time.
#
# Copyright (c) 2005 UK Citizens Online Democracy. All rights reserved.
# Email: chris@mysociety.org; WWW: http://www.mysociety.org/
#

my $rcsid = ''; $rcsid .= '$Id: pgwatchlocks,v 1.10 2006-05-26 14:38:04 chris Exp $';

use strict;

use DBI;
use POSIX;

my $dbh = DBI->connect('dbi:Pg:dbname=template1', 'postgres', '', { PrintWarn => 1, PrintError => 1, RaiseError => 0, AutoCommit => 1 }) || die DBI->errstr();

# relation_name DB REL
# Given a numeric DB id and REL id, return the text name of the relation.
my %reln_name;
sub relation_name ($$) {
    my ($db, $rel) = @_;
    return $reln_name{"$db.$rel"} if (exists($reln_name{"$db.$rel"}));
    
    my $dbname = $dbh->selectrow_array('select datname from pg_catalog.pg_stat_database where datid = ?', {}, $db);
    $dbname ||= "<db#$db>";
    # We can't get the relation name from the pg_stat_user_tables view, because
    # that will only show the names of relations in the current database.
    # Instead make a new connection to the db name we've just found.
    my $newdbh = DBI->connect("dbi:Pg:dbname=$dbname", 'postgres', '', { PrintWarn => 0, PrintError => 0, RaiseError => 0 }) || warn "connecting to '$dbname' to get name of reln $rel: " . DBI->errstr() . "\n";
    my $relname = $newdbh->selectrow_array('select relname from pg_catalog.pg_stat_user_tables where relid = ?', {}, $rel);
    $relname ||= "<rel#$rel>";
    $newdbh->disconnect();
    return $reln_name{"$db.$rel"} = "$dbname.$relname";
}

# describe_interval N
# Describe an N-second interval in readable form..
sub describe_interval ($) {
    my $n = shift;
    if ($n < 90) {
        return sprintf('%ds', $n);
    } elsif ($n < 5400) {
        return sprintf('%dm %ds', int($n / 60), $n % 60);
    } else {
        return sprintf('%dh %dm %ds', int($n / 3600), int(($n % 3600) / 60), $n % 60);
    }
}

# current_activity PID
# Return the current activity of the connection with the given backend PID.
sub current_activity ($) {
    my $pid = shift;
    return "??" unless (defined($pid));
    my ($user, $query, $when) = $dbh->selectrow_array('select usename, current_query, extract(epoch from query_start) from pg_catalog.pg_stat_activity where procpid = ?', {}, $pid);
    if (defined($user)) {
        $query ||= '?';
        $query =~ s/\s+/ /gs;   # clean up any \n etc.
        my $what;
        if ($query =~ /^<IDLE>(.*)/) {
            $what = "idle$1";
        } else {
            $what = "executing << $query >>";
        }
        $what .= " for " . describe_interval(time() - $when)
            if (defined($when));

        return $what;
    } else {
        return "??";
    }
}

# pid_of_transaction ID
# Given a transaction ID, return the backend PID of the connection executing
# it.
sub pid_of_transaction ($) {
    my $tx = shift;
    # Find the connection which has an exclusive lock on that transaction.
    return scalar($dbh->selectrow_array("select pid from pg_locks where transaction = ? and mode = 'ExclusiveLock'", {}, $tx));
}

# hash of comma-separated pg_locks row to time of start.
my %outstanding_locks;
my %last_report;

#
# Poll pg_locks to discover which connections are waiting for locks. When we
# find one, warn about it, reporting as much detail as we can about what
# process is holding the lock.
# 

while (1) {

    my $r = $dbh->selectall_arrayref("select relation, database, transaction, pid, mode, granted from pg_catalog.pg_locks where not granted");
    
    my %locks;
    foreach my $lock (@$r) {
        my $row = join(',', map { defined($_) ? $_ : '' } @$lock);
        $locks{$row} = time();
    }

    # Difference the two arrays.
    foreach (grep { !exists($locks{$_}) } keys %outstanding_locks) {
        delete($outstanding_locks{$_});
        delete($last_report{$_});
    }

    foreach (grep { !exists($outstanding_locks{$_}) } keys %locks) {
        $outstanding_locks{$_} = $locks{$_};
    }
    
    print STDERR "[H[2J";   # clear screen.
    
    # Now look for processes which have been waiting for locks for too long.
    foreach (keys %outstanding_locks) {
        next unless ($outstanding_locks{$_} < time() - 10);

        my ($last, $n) = (0, 0);
        ($last, $n) = @{$last_report{$_}} if (exists($last_report{$_}));

        next if ($last > time() - 10 * (1.5 ** $n));
        $last_report{$_} = [time(), $n + 1];

        my ($relation, $database, $transaction, $pid, $mode, $granted)
                = map { $_ eq '' ? undef : $_ } split(/,/, $_);

        print POSIX::strftime('%H:%M:%S', localtime()),
                " blocked: PID $pid needs $mode on ";

        if ($relation) {
            print "relation ", relation_name($database, $relation);
        } else {
            print "transaction #$transaction";
        }

        print "; duration ", describe_interval(time() - $outstanding_locks{$_}), "\n";
        print "                  ", current_activity($pid), "\n";

        # Now we need to show the processes it's waiting for, and what they're
        # doing.
        my @pids;
        if ($relation) {
            @pids = @{$dbh->selectall_arrayref('select pid, mode from pg_locks where granted and database = ? and relation = ?', {}, $database, $relation)};
        } else {
            @pids = ([pid_of_transaction($transaction), 'ExclusiveLock']);
        }

        for (my $i = 0; $i < @pids; ++$i) {
            if ($i == 0) {
                print "              by: ";
            } else {
                print "                  ";
            }

            my ($pid, $mode) = @{$pids[$i]};
            print "PID $pid holds $mode\n";
            print "                  ", current_activity($pid), "\n";
        }

        # XXX rationalise all those prints
        # XXX it'd be nice to print details of the *client* processes of the
        # back-ends, where we can. That requires picking over /proc, and
        # perhaps sshing out to other machines -- a pain in the arse,
        # basically.
    }
    
    sleep(1);
}

