#!/usr/bin/perl
#
# mgcDownloadStep [-verbose] [-ftp-verbose]
#
# Download MGC table dumps if new files are available, untar and process
# with mgcImport.  Download is stored in a directory:
#
#   data/download/mgc/$year.$month.$day/
#
# with the date being the modification date of the downloaded mgctables.tar
# file.
#
# This also processes the data into
#   data/processed/mgc/$year.$month.$day/
#
# Expects a file etc/mgc.conf which is a PERL script setting the
# variables:  $main::mgcFtpHost, $main::mgcFtpUser, $main::mgcFtpPassword
#
# -verbose - print details
# -ftp-verbose - print details of interaction with ftp server
#
# $Id: mgcDownloadStep,v 1.4 2007/02/25 17:17:06 markd Exp $
#
use strict;
use warnings;
use File::Basename;
use POSIX qw(strftime);
use FindBin;
use lib "$FindBin::Bin/../lib";
use Net::FTP;
use gbCommon;
use gbFtp;

# Constants: path of the configuration file that is evaluated for the ftp
# settings, and name of the tables tar file on the ftp server.
my $MGC_CONF = 'etc/mgc.conf';
my $MGC_TAR = 'mgctables.tar';

# Names of the files that must be present in the downloaded tar file.
my @MGC_TABLE_TAR_FILES = qw(
    clone.gz
    fullength.gz
    library.gz
    stage1.gz
    table.descriptions
);

# Build the date string (formatted year.month.day, in local time) from the
# modification time of the tables tar file on the ftp server.  The date is
# also logged when running verbose.
sub getDownloadDate() {
    my $tarModTime = ftpGetModTime($MGC_TAR);
    my @timeParts = localtime($tarModTime);
    my $dateStr = strftime("%Y.%m.%d", @timeParts);
    prMsg("ftp://$main::mgcFtpHost/$MGC_TAR modified on $dateStr")
        if ($gbCommon::verbose);
    return $dateStr;
}

# Fetch the tables tar file from the ftp server into the tmp subdirectory
# of the given download directory.
sub downloadTables($) {
    my($downloadDir) = @_;
    my $tmpTarPath = join("/", $downloadDir, "tmp", $MGC_TAR);
    # drop any copy left from an earlier run; the result is deliberately
    # ignored since the file normally doesn't exist
    unlink($tmpTarPath);
    makeFileDir($tmpTarPath);
    ftpGetOrCheck(0, 0, $MGC_TAR, $tmpTarPath);
}

# Extract tar file, verify contents and install them in download dir.
#
# The tar file is expected at $downloadDir/tmp/$MGC_TAR.  It is unpacked in
# that tmp directory, each file named in @MGC_TABLE_TAR_FILES is checked for
# existence (and the *.gz ones are read through zcat), and the files are then
# renamed into $downloadDir.  The tmp directory is removed afterwards.
#
# Returns the list of installed files, as $downloadDir/<file> paths.
# Dies if the current directory can't be determined or changed, if a hard
# link can't be created, or if an expected file is missing.
sub extractTar($) {
    my($downloadDir) = @_;
    my $tmpDir = "$downloadDir/tmp";
    # POSIX is already loaded by this script; ask it for the current
    # directory rather than forking a shell to run pwd.
    my $oldDir = POSIX::getcwd();
    if (!defined($oldDir)) {
        die("can't get current directory: $!");
    }
    chdir($tmpDir) || die("can't chdir($tmpDir): $!");
    if ($gbCommon::verbose) {
        prMsg("extracting MGC table files");
    }

    # FIXME: at one point, absolute paths were included in the tar (which get
    # extracted relatively).  Compensate for this by linking back to this dir.
    my $tarOut = callProg("tar -xvf $MGC_TAR");
    chomp($tarOut);
    foreach my $path (split("\n", $tarOut)) {
        if (dirname($path) ne ".") {
            my $file = basename($path);
            $path =~ s/^\///; # drop leading /
            # a member stored directly under / is already in this directory
            # once the leading slash is dropped; linking it to itself fails
            next if ($path eq $file);
            # directory entries listed by tar can't be hard linked and are
            # not among the expected files
            next if (-d $path);
            link($path, $file) || die("can't link $path $file: $!");
        }
    }

    # check files exist and can be read if gziped
    foreach my $file (@MGC_TABLE_TAR_FILES) {
        if (! -e $file) {
            die("didn't find file $file in $tmpDir/$MGC_TAR");
        }
        if ($file =~ /\.gz$/) {
            runProg("zcat $file >/dev/null");
        }
    }

    # Move files into place; the cwd is still the tmp dir, so ../ is the
    # download dir.
    my @fileList;
    foreach my $file (@MGC_TABLE_TAR_FILES) {
        renameFile($file, "../$file");
        push(@fileList, "$downloadDir/$file");
    }
    chdir($oldDir) || die("can't chdir($oldDir): $!");
    removeDir($tmpDir);
    return @fileList;
}

# Do download for the specified data dir, if it's not there.
#
# The file $downloadDir/mgc.md5 marks a completed download: it is written
# last, after the tar file has been fetched and extracted and the extracted
# files passed to gbChmod.  If it already exists, nothing is downloaded.
sub doDownload($) {
    my($downloadDir) = @_;

    my $md5file = "$downloadDir/mgc.md5";
    if (! -e $md5file) {
        if ($gbCommon::verbose) {
            prMsg("downloading MGC tables in $downloadDir");
        }
        makeDir($downloadDir);
        downloadTables($downloadDir);
        my @mgcFiles = extractTar($downloadDir);
        gbChmod(@mgcFiles);
        md5Files($md5file, @mgcFiles);
    } else {
        if ($gbCommon::verbose) {
            prMsg("MGC tables already in $downloadDir");
        }
    }
}

# Run mgcImport on a download directory to produce the status files in the
# matching data/processed/mgc/<date> directory.  The mgc.md5 file in the
# processed directory marks completion; when it exists nothing is redone.
sub doProcess($) {
    my($downloadDir) = @_;
    my $outDir = "data/processed/mgc/" . basename($downloadDir);
    my $checksumFile = "$outDir/mgc.md5";

    if (-e $checksumFile) {
        if ($gbCommon::verbose) {
            prMsg("MGC status files already in $outDir");
        }
    } else {
        if ($gbCommon::verbose) {
            prMsg("processing MGC tables in $outDir");
        }
        makeDir($outDir);
        my @outTabs = map { "$outDir/$_" } ("mgcStatus.tab.gz", "mgcFullStatus.tab.gz");
        runProg(join(" ", "mgcImport", $downloadDir, @outTabs));
        gbChmod(@outTabs);
        md5Files($checksumFile, @outTabs);
    }
}

# Entry point
#
# Parse the leading options; any argument left over is an error.
while (($#ARGV >= 0) && ($ARGV[0] =~/^-.*/)) {
    my $opt = $ARGV[0];
    shift @ARGV;
    if ($opt eq "-verbose") {
        $gbCommon::verbose = 1;
    } elsif ($opt eq "-ftp-verbose") {
        $gbFtp::verbose = 1;
    } else {
        gbError("invalid option \"$opt\"");
    }
}

if ($#ARGV >= 0) {
    die("Wrong \# args: mgcDownloadStep [-verbose] [-ftp-verbose]");
}

# read ftp info from file
# NOTE: the conf file is executed as Perl code, so it must be a trusted file.
$main::mgcFtpHost = undef;
$main::mgcFtpUser = undef;
$main::mgcFtpPassword = undef;
eval(readFile($MGC_CONF));
if ($@) {
    # without this check a syntax or runtime error in the conf file would
    # only show up as a misleading "variable not set" error below
    gbError("error evaluating $MGC_CONF: $@");
}
if (!(defined($main::mgcFtpHost))) {
    gbError("$MGC_CONF must set \$main::mgcFtpHost");
}
if (!(defined($main::mgcFtpUser))) {
    gbError("$MGC_CONF must set \$main::mgcFtpUser");
}
if (!(defined($main::mgcFtpPassword))) {
    gbError("$MGC_CONF must set \$main::mgcFtpPassword");
}


# use different task dir to allow running parallel with genebank
beginTask("mgcbuild", "download");
ftpInit($main::mgcFtpHost, $main::mgcFtpUser, $main::mgcFtpPassword);
ftpOpen();

# get the download directory from the ftp file mod date and then
# check a checksum file exists, indicating completion.
my $dateStr = getDownloadDate();
my $downloadDir = "data/download/mgc/$dateStr";

doDownload($downloadDir);
ftpClose();

# check if any download dirs need to be processed; this covers every dated
# download directory, not just the one handled above
foreach my $downDir (glob("data/download/mgc/*.*.*")) {
    doProcess($downDir);
}
endTask();
