#!/usr/bin/perl

# Usage/help text, printed when the command line is malformed.  The string
# is double-quoted, so literal dollar signs and double quotes are escaped.
$main::usage = "
gbAlignStep [options] database ...

Options:
  -initial - Default the parameters for the initial alignment of a genome:
     -workdir=work/initial.\$db/align
   Doesn't create a semaphone file and will log to 
      var/build/logs/2003.05.23-21:51:12.\$db.initalign.log
   Will not migrate alignments from previous releases.

  -workdir=work/align - work directory where alignment is built.
   This directory must have at least two levels (for use with rsync) for
   example: rest/work2/work1.  Specifying a different parent directory
   (work2) for doing alignments without risking of collision with the
   automatic alignments.  The two levels of directories allows rsync to
   delete older files under the parent directory when the lowest-level
   directory includes the alignment date.  Maybe a longer or fully qualified
   path.

  -clusterRootDir=dir - The directory where the files will exist on the
   cluster. The lowest two levels of the workdir are included, resulting, e.g.:
   /dir/work/align.  Overrides cluster.rootDir conf item.

  -paraHub=kk - name of parasol host, must have ssh access.

  -paraPriority=n - parasol priority to use for jobs

  -continue=subtask - continue processing of a failed alignment run
   of this script.   Values for subtask are:
       - copy - skip setup and continue with the remaining steps.
       - run - continue with parasol blat run.
       - finish - finish processing the alignments.
   Continue the blat portion by using parasol directly.
   
  -keep - Keep work directory

  -verbose=n - Set verbose level.

  -srcDb=name - Restrict the source database to either \"genbank\" or \"refseq\".

  -type=name - Restrict the type of sequence processeed to either \"mrna\"
   or \"est\".

  -orgCat=cat - Restrict the organism category of sequences processeed to
   \"native\" or \"xeno\".

  -maxParallel=1 - maximum number of database to align in parallel.

Obtains other parameters from etc/genbank.conf.

Alignment step, runs:
 - gbAlignSetup - Extract sequences to align, create jobs files.
 - gbAlignRun - rsh to kk adn run para make until jobs complete
   checking results.
 - gbAlignFinish - Process PSL files and install in data directory
";

#
# $Id: gbAlignStep,v 1.25 2010/04/11 06:48:30 markd Exp $
#
use strict;
use warnings;
use File::Basename;
use FindBin;
use lib "$FindBin::Bin/../lib";
use gbCommon;
setupServerPath();

# open /cluster/ directories to avoid automounter problems on
# genbank server results in "Device or resource busy" error
# (FIXME: tmp hack)
use POSIX;
# Open a directory with POSIX::open and deliberately never close the
# descriptor, so the directory stays open for the life of this process.
#   $path - directory to open
# Dies, reporting the system error, if the directory can't be opened.
sub holdOpenDir($) {
    my($path) = @_;
    my $fd = POSIX::open($path);
    if (!defined($fd)) {
        # include $! so the log shows why the open failed
        die("can't open cluster directory $path: $!");
    }
}
# Hold open each of the /cluster directories this script depends on.
sub openNfsDirsHack() {
    foreach my $clusterDir ("/cluster/data", "/cluster/genbank", "/cluster/bin") {
        holdOpenDir($clusterDir);
    }
}

# Entry
#
# Command-line state.  These file-level variables are filled in by the
# option loop below and read by the subs later in the file.
my $workDir;
my $verbArg = "";       # verbose flag in single-dash form, for gbAlignRun/gbAlignFinish
my $paraHub;
my $keep = 0;
my $clusterRootDir;
my $clusterWorkDir;
my $localWorkDir;
my @srcDbs;
my @types;
my @orgCats;
my $initialBuild = 0;
my $continueTask;
my $verbArgl="";        # verbose flag in double-dash form, for gbAlignSetup
my $paraPriority;
my $maxParallel = 1;

# Consume leading arguments that start with "-"; whatever remains in @ARGV
# is the list of databases.
while (($#ARGV >= 0) && ($ARGV[0] =~/^-.*/)) {
    my $opt = $ARGV[0];
    shift @ARGV;
    if ($opt =~ /^-workdir($|=)/) {
        $workDir = parseOptEq($opt);
    } elsif ($opt eq "-initial") {
        $initialBuild = 1;
    } elsif ($opt =~ /^-clusterRootDir($|=)/) {
        $clusterRootDir = parseOptEq($opt);
    } elsif ($opt =~ /^-continue($|=)/) {
        $continueTask = parseOptEq($opt);
    } elsif ($opt eq "-verbose") {
        # bare -verbose means level 1
        $gbCommon::verbose = 1;
        $verbArg = "-verbose";
        $verbArgl = "--verbose=1";
    } elsif ($opt =~ /^-verbose=/) {
        $gbCommon::verbose = parseOptEq($opt);
        $verbArg = "-verbose=$gbCommon::verbose";
        $verbArgl = "--verbose=$gbCommon::verbose";
    } elsif ($opt eq "-keep") {
        $keep = 1;
    } elsif ($opt =~ /^-paraHub($|=)/) {
        $paraHub = parseOptEq($opt);
    } elsif ($opt =~ /^-paraPriority($|=)/) {
        $paraPriority = parseOptEq($opt);
    } elsif ($opt =~ /^-srcDb($|=)/) {
        # value check in gbAlignSetup
        my $srcDb = parseOptEq($opt);
        push(@srcDbs, $srcDb);
    } elsif ($opt =~ /^-type($|=)/) {
        # value check in gbAlignSetup
        my $type = parseOptEq($opt);
        push(@types, $type);
    } elsif ($opt =~ /^-orgCat($|=)/) {
        # value check in gbAlignSetup
        my $orgCat = parseOptEq($opt);
        push(@orgCats, $orgCat);
    } elsif ($opt =~ /^-maxParallel($|=)/) {
        $maxParallel = parseOptEq($opt);
    } else {
        gbError("invalid option \"$opt\"");
    }
}

# A -maxParallel value below one (or a non-number) would keep the scheduler
# from ever starting a child and fail with a misleading "child process lost";
# reject it here with a clear message instead.
if (!defined($maxParallel) || ($maxParallel !~ /^\d+$/) || ($maxParallel < 1)) {
    my $badValue = defined($maxParallel) ? $maxParallel : "";
    gbError("invalid value for -maxParallel, expected a positive integer, got \"$badValue\"");
}

# Everything left on the command line is a database name; at least one is
# required.
if (scalar(@ARGV) == 0) {
    gbError("wrong # args: $main::usage");
}
# move all remaining arguments out of @ARGV, leaving it empty
my @databases = splice(@ARGV, 0);

# -clusterRootDir overrides the cluster.rootDir conf item
unless (defined($clusterRootDir)) {
    $clusterRootDir = getConf("cluster.rootDir");
}

# With -initial exactly one database is allowed, and the work directory
# defaults to one named after that database.
my $initialDb;
if ($initialBuild) {
    gbError("only one database maybe specified with -initial")
        if (scalar(@databases) != 1);
    $initialDb = $databases[0];
    $workDir = "work/initial.$initialDb/align" unless defined($workDir);
}

# Fall back to the standard work directory when none was chosen above.
$workDir = "work/align" unless defined($workDir);

# A relative work directory is placed under clusterRootDir on the cluster;
# an absolute one is used unchanged and clusterRootDir is ignored.  The
# local work directory is always the work directory as given.
$clusterWorkDir = isAbs($workDir) ? $workDir : $clusterRootDir . "/" . $workDir;
$localWorkDir = $workDir;
# Translate the -continue value into the list of tasks to execute.  Without
# -continue every task runs; otherwise execution resumes at the named task.
my @tasks;
{
    my %tasksFrom = (
        "copy"   => ["copy", "run", "finish"],
        "run"    => ["run", "finish"],
        "finish" => ["finish"],
    );
    if (!defined($continueTask)) {
        @tasks = ("setup", "copy", "run", "finish");
    } elsif (exists($tasksFrom{$continueTask})) {
        @tasks = @{$tasksFrom{$continueTask}};
    } else {
        gbError("invalid value for -continue, expected one of \"copy\", \"run\", or \"finish\"");
    }
}

# Default TMPDIR to a directory named "tmp", relative to the current
# directory, when the environment does not supply one.  /var/tmp doesn't
# usually have enough space, so a tmp dir in the gbRoot is used instead.
unless (defined($ENV{TMPDIR})) {
    my $localTmp = "tmp";
    $ENV{TMPDIR} = $localTmp;
    makeDir($localTmp);
}

# Build and run the gbAlignSetup command line for one database.
#   $db               - database name, passed as the final argument
#   $localWorkDirDb   - per-database work directory (--workdir)
#   $clusterWorkDirDb - per-database cluster work directory (--clusterWorkDir)
# The -srcDb, -type and -orgCat restrictions given to this script are
# forwarded, and --noMigrate is added for an -initial build.
sub gbAlignSetup($$$) {
    my($db, $localWorkDirDb, $clusterWorkDirDb) = @_;

    # each entry carries its own leading blank so the pieces join directly
    my @extraArgs;
    push(@extraArgs, " --noMigrate") if ($initialBuild);
    push(@extraArgs, " --clusterWorkDir=$clusterWorkDirDb") if (defined($clusterRootDir));
    push(@extraArgs, map { " --srcDb=$_" } @srcDbs);
    push(@extraArgs, map { " --type=$_" } @types);
    push(@extraArgs, map { " --orgCat=$_" } @orgCats);

    runProg(join("", "gbAlignSetup --workdir=$localWorkDirDb $verbArgl", @extraArgs, " $db"));
}

# Build and run the gbAlignRun command line for one database.
#   $db             - database name
#   $localWorkDirDb - per-database work directory (-workdir)
# -paraHub and -paraPriority are forwarded when they were specified.
# NOTE(review): $db is not placed on the gbAlignRun command line, unlike
# gbAlignSetup and gbAlignFinish; presumably gbAlignRun takes no database
# argument - confirm.
sub gbAlignRun($$) {
    my($db, $localWorkDirDb) = @_;

    # each entry carries its own leading blank so the pieces join directly
    my @paraArgs;
    push(@paraArgs, " -paraHub=$paraHub") if (defined($paraHub));
    push(@paraArgs, " -paraPriority=$paraPriority") if (defined($paraPriority));

    runProg(join("", "gbAlignRun -workdir=$localWorkDirDb $verbArg", @paraArgs));
}

# Build and run the gbAlignFinish command line for one database.
#   $db             - database name, passed as the final argument
#   $localWorkDirDb - per-database work directory (-workdir)
# -noMigrate is added for an -initial build.
sub gbAlignFinish($$) {
    my($db, $localWorkDirDb) = @_;
    my $migrateArg = $initialBuild ? " -noMigrate" : "";
    runProg("gbAlignFinish -workdir=$localWorkDirDb $verbArg" . $migrateArg . " $db");
}


# Run the selected alignment tasks for one database.  Designed so that
# several databases can be processed in forked subprocesses.
#   $db - database name; also the name of the per-database subdirectory
#         created under both the local and the cluster work directories.
# Which steps execute is controlled by the file-level @tasks list.  Errors
# are reported through gbError or die.
sub alignForDatabase($) {
    my($db) = @_;
    my $localWorkDirDb = "$localWorkDir/$db";
    my $clusterWorkDirDb = "$clusterWorkDir/$db";

    my $alignNone = "$localWorkDirDb/align.none";  # indicates nothing to align
    my $alignJobs = "$localWorkDirDb/align.jobs";
    my $alignDone = "$localWorkDirDb/align.done";
    makeDir($localWorkDirDb);
    makeDir($clusterWorkDirDb);

    # setup
    if (inList("setup", @tasks)) {
        gbAlignSetup($db, $localWorkDirDb, $clusterWorkDirDb);
        if (-e $alignNone) {
            prMsg("nothing to align");
        }
    }
    # Must either have a jobs or none file
    if (! (-e $alignNone || -e $alignJobs)) {
        gbError("no $alignJobs or $alignNone file found");
    }

    # run alignment
    if (inList("run", @tasks) && !-e $alignNone) {
        gbAlignRun($db, $localWorkDirDb);
    }

    # If -continue=finish is specified, force the creation of the align.done
    # file, since the run step was completed by hand.
    if (defined($continueTask) && ($continueTask eq "finish")) {
        # three-argument open with a lexical handle: the path is never
        # parsed for a mode, and the handle can't collide with a global one
        open(my $doneFh, ">", $alignDone) || die("can't create $alignDone: $!");
        close($doneFh) || die("close of $alignDone failed: $!");
    }

    if (inList("finish", @tasks) && !-e $alignNone) {
        gbAlignFinish($db, $localWorkDirDb);
    }
}

# Fork a child process that aligns one database.
#   $db - database name handed to alignForDatabase in the child
# Returns the child's pid in the parent.  The child never returns from this
# sub: it exits with status 0 once alignForDatabase returns.
sub startAlignForDatabase($) {
    my($db) = @_;
    my $childPid = fork();
    die("can't fork") unless (defined($childPid));
    return $childPid if ($childPid != 0);

    # only the child reaches this point
    alignForDatabase($db);
    exit(0);
}

# Wait for any one child alignment process to terminate.
# Returns (pid, ok), where ok is 1 if the child exited with status 0 and
# 0 if it exited with any other status.
# Dies if there is no child to wait for, if the child was killed by a
# signal, or if the wait status is neither a normal exit nor a signal.
sub waitAlignForDatabase() {
    my $pid = waitpid(-1, 0);
    my $stat = $?;
    if ($pid < 0) {
        die("child process lost");
    }
    if (WIFEXITED($stat)) {
        return ($pid, ((WEXITSTATUS($stat) == 0) ? 1 : 0));
    }
    if (WIFSIGNALED($stat)) {
        die("alignForDatabase failed with signal " . WTERMSIG($stat) . " (pid $pid)");
    }
    # Never fall off the end: an empty return would hand the caller an
    # undefined pid and status.
    die("alignForDatabase (pid $pid) ended with unexpected wait status $stat");
}

# Align every database in @databases, keeping at most $maxParallel child
# processes running at a time.  A message is printed as each database
# finishes; dies after all have finished if any of them failed.
sub scheduler() {
    my @queue = @databases;   # databases not yet started, taken from the end
    my %dbOfPid;              # child pid -> database it is aligning
    my $active = 0;
    my $failures = 0;

    while (@queue || ($active > 0)) {
        # start children until the limit is reached or nothing is pending
        while (@queue && ($active < $maxParallel)) {
            my $nextDb = pop(@queue);
            my $childPid = startAlignForDatabase($nextDb);
            $dbOfPid{$childPid} = $nextDb;
            $active++;
        }

        # block until any one child finishes
        my($donePid, $ok) = waitAlignForDatabase();
        my $doneDb = $dbOfPid{$donePid};
        $dbOfPid{$donePid} = undef;
        $active--;
        if (!$ok) {
            prMsg("FAILED: $doneDb");
            $failures++;
        } else {
            prMsg("Succeeded: $doneDb");
        }
    }
    die("FAILED DATABASES: $failures") if ($failures > 0);
}

checkOnBuildServer();
openNfsDirsHack();

# begin: an -initial build starts its task without a lock and under a
# per-database name
if (!$initialBuild) {
    beginTask("build", "align");
} else {
    beginTaskNoLock("build", "$initialDb.initalign");
}

scheduler();

# Unless -keep was given, remove the work directory and then its parent;
# rmdir only removes an empty directory.
unless ($keep) {
    runProg("rm -rf $localWorkDir");
    rmdir(dirname($localWorkDir));  # don't complain if can't remove
}
endTask();
