#!/usr/bin/env perl

use warnings;
use strict;

use Getopt::Long;
use Cwd;

use lib "/cluster/bin/scripts";
use Encode;
use RAFile;
use HgAutomate;
use HgDb;

# Maps a metadata variable name to the name of the cv.ra section its values
# are filed under.  Names without an entry are looked up under their own
# name (see mapMdbToCv).
my %expVarCVMapper = (
    'cell'       => 'Cell Line',
    'antibody'   => 'Antibody',
    'treatment'  => 'treatment',
    'control'    => 'control',
    'obtainedBy' => 'lab',
);

# For each experimental variable, the cv.ra fields that are written to the
# SOFT file as additional "!Sample_characteristics" lines.
my %expVarDetails = (
    'cell'      => [ qw(organism description karyotype lineage sex) ],
    'antibody'  => [ qw(antibodyDescription targetDescription vendorName) ],
    'treatment' => [ qw(description) ],
    'control'   => [ qw(description) ],
);

# English labels for cv.ra field names; a field without an entry is printed
# under its cv.ra name.
my %detailsCleaner = (
    'antibodyDescription' => 'description',
    'targetDescription'   => 'target description',
    'vendorName'          => 'vendor name',
);

# Maps the dataType spellings found in the metadata to one normalized name.
# The normalized name is the key used by the per-datatype tables
# (%dataTypeMolecule, %dataTypeLibraryStrategy, %dataTypeLibrarySource,
# %dataTypeLibrarySelection), so every value here has to be spelled exactly
# like the keys of those tables.
my %dataTypeCleaner = (
    "Cage"           => "Cage",
    "ChIPseq"        => "ChIP-seq",
    "ChIPSeq"        => "ChIP-seq",
    "Chipseq"        => "ChIP-seq",
    "ChipSeq"        => "ChIP-seq",
    "DnaPet"         => "DnaPet",
    # was "DNaseDgf", which matches no key of the per-datatype tables and
    # made every DnaseDgf composite die on the molecule lookup
    "DnaseDgf"       => "DnaseDgf",
    "DNaseSeq"       => "DNase-seq",
    "DnaseSeq"       => "DNase-seq",
    "FAIREseq"       => "FAIRE-seq",
    "MethylRrbs"     => "MethylRrbs",
    "MethylSeq"      => "MethylSeq",
    "Orchid"         => "Orchid",
    "Proteogenomics" => "Proteogenomics",
    "RnaPet"         => "RnaPet",
    "RnaSeq"         => "RNA-seq",
);

# Given a normalized datatype (see %dataTypeCleaner), gives the molecule
# that is written to the "!Sample_molecule" line of the SOFT file.
# NOTE(review): GEO is understood to accept only a fixed, lower-case
# vocabulary here ("total RNA", "polyA RNA", "genomic DNA", "protein", ...);
# confirm against the current GEO SOFT documentation.
my %dataTypeMolecule = (
    "Cage"           => "total RNA",    # was "Total RNA"; now spelled like RnaPet
    "ChIP-seq"       => "genomic DNA",
    "DnaPet"         => "genomic DNA",
    "DnaseDgf"       => "genomic DNA",
    "DNase-seq"      => "genomic DNA",
    "FAIRE-seq"      => "genomic DNA",
    "MethylRrbs"     => "genomic DNA",
    "MethylSeq"      => "genomic DNA",
    "Orchid"         => "genomic DNA",
    "Proteogenomics" => "protein",
    "RnaPet"         => "total RNA",
    "RNA-seq"        => "polyA RNA",
);

# Given a normalized datatype, gives the value written to the
# "!Sample_library_strategy" line of the SOFT file.
my %dataTypeLibraryStrategy = (
    'Cage'           => 'OTHER',
    'ChIP-seq'       => 'ChIP-Seq',
    'DnaPet'         => 'OTHER',
    'DnaseDgf'       => 'DNase-Hypersensitivity',
    'DNase-seq'      => 'DNase-Hypersensitivity',
    'FAIRE-seq'      => 'OTHER',
    'MethylRrbs'     => 'Bisulfite-Seq',
    'MethylSeq'      => 'MRE-Seq',
    'Orchid'         => 'OTHER',
    'Proteogenomics' => 'mass spectrometry-based proteogenomic mapping',
    'RnaPet'         => 'OTHER',
    'RNA-seq'        => 'RNA-Seq',
);

# Given a normalized datatype, gives the value written to the
# "!Sample_library_source" line of the SOFT file.
my %dataTypeLibrarySource = (
    'Cage'           => 'transcriptomic',
    'ChIP-seq'       => 'genomic',
    'DnaPet'         => 'genomic',
    'DnaseDgf'       => 'genomic',
    'DNase-seq'      => 'genomic',
    'FAIRE-seq'      => 'genomic',
    'MethylRrbs'     => 'genomic',
    'MethylSeq'      => 'genomic',
    'Orchid'         => 'genomic',
    'Proteogenomics' => 'protein',
    'RnaPet'         => 'transcriptomic',
    'RNA-seq'        => 'transcriptomic',
);

# Given a normalized datatype, gives the value written to the
# "!Sample_library_selection" line of the SOFT file.
my %dataTypeLibrarySelection = (
    'Cage'           => 'CAGE',
    'ChIP-seq'       => 'ChIP',
    'DnaPet'         => 'size fractionation',
    'DnaseDgf'       => 'DNase',
    'DNase-seq'      => 'DNase',
    'FAIRE-seq'      => 'other',
    'MethylRrbs'     => 'Reduced Representation',
    'MethylSeq'      => 'Restriction Digest',
    'Orchid'         => 'other',
    'Proteogenomics' => 'chromatographically fractionated peptides',
    'RnaPet'         => 'other',
    'RNA-seq'        => 'cDNA',
);

# Package globals for the command-line options.  GetOptions stores the value
# of every option that is listed without an explicit destination in the
# package variable $opt_<name>.
our (
    $opt_configDir,
    $opt_verbose,
    $opt_instance,
    $opt_geoUser,
    $opt_soft,
    $opt_downloadDir,
    $opt_release,
    $opt_database,
);

# Print the command-line synopsis to STDERR and terminate the script with
# exit status 1.  Called when option parsing fails or fewer than three
# positional arguments are given.
sub usage {
    print STDERR <<END;
usage: encodeMkGeoPkg database composite seqPlatform
    -soft               Only create the SOFT file
    -instance=s         Use ENCODE pipeline instance s (default prod).
    -configDir=dir      Path of configuration directory, containing metadata
                        .ra files (default: /hive/groups/encode/dcc/pipeline/encpipeline_prod/config/)
    -database=s		Database on which the metadata will be checked (default: mysqlbeta)
    -downloadDir=dir    Prefix to the downloads directory (default: /hive/groups/encode/dcc/analysis/ftp/pipeline/)
    -geoUser=s          User name used as prefix of the package name (default: ucsc_encode_dcc)
    -release=s		Name of release of composite to be checked (default: "", uses download folder)
    -verbose=num        Set verbose level to num (default 1).
END
exit 1;
}

############################################################################
# Function

# Fetch all metadata of one object.
#
#   $db    - database handle; must offer execute($sql) returning a statement
#            handle with fetchrow_array() (an HgDb object in this script)
#   $obj   - name of the object whose var/val pairs are wanted
#   $mysql - host the handle is connected to; on "mysqlbeta" the table
#            metaDb_public is read, on any other host metaDb
#
# Returns a hash (as a list) of var => val; empty when nothing is found.
sub getObjMetadata {
    my ($db, $obj, $mysql) = @_;

    # Choose the table first and run a single query.  The previous code
    # always read metaDb and threw the metaDb_public result away, because it
    # was assigned to a second, shadowing "my $results".
    my $table = (defined $mysql and $mysql eq "mysqlbeta") ? "metaDb_public" : "metaDb";

    # NOTE(review): $obj is interpolated into the SQL text unescaped; switch
    # to bound parameters if HgDb::execute supports them.
    my $results = $db->execute("SELECT var, val FROM $table WHERE obj = '$obj'");

    my %metadata = ();
    if ($results) {
        while (my ($key, $value) = $results->fetchrow_array()) {
            $metadata{$key} = $value;
        }
    }

    return %metadata;
}

# List the objects that carry a given metadata setting.
#
#   $db    - database handle; must offer execute($sql) returning a statement
#            handle with fetchrow_array() (an HgDb object in this script)
#   $var   - metadata variable to match
#   $val   - value the variable must have
#   $mysql - host the handle is connected to; on "mysqlbeta" the table
#            metaDb_public is read, on any other host metaDb
#
# Returns the list of matching object names (possibly empty).
sub getObjsMatching {
    my ($db, $var, $val, $mysql) = @_;

    # Choose the table first and run a single query.  The previous code
    # always read metaDb and threw the metaDb_public result away, because it
    # was assigned to a second, shadowing "my $results".
    my $table = (defined $mysql and $mysql eq "mysqlbeta") ? "metaDb_public" : "metaDb";

    # NOTE(review): $var and $val are interpolated into the SQL text
    # unescaped; switch to bound parameters if HgDb::execute supports them.
    my $results = $db->execute("SELECT obj FROM $table WHERE var = \"$var\" and val = \"$val\"");

    my @objects = ();
    if ($results) {
        while (my ($obj) = $results->fetchrow_array()) {
            push @objects, $obj;
        }
    }

    return @objects;
}

# Return the value that all given metadata records share for one field.
#
#   $field        - name of the metadata variable to read
#   @metadataList - hash references, one per object (var => val)
#
# Returns the value of $field in the first record (undef when the field is
# absent there).  Dies when the list is empty or when any later record holds
# a different value.  As before, a missing value and the empty string count
# as equal, but they are now compared without "uninitialized" warnings.
sub getSameMetadata {
    my ($field, @metadataList) = @_;

    @metadataList or die "getSameMetadata: no metadata given for field '$field'\n";

    my $firstValue = $metadataList[0]{$field};
    my $expected = defined $firstValue ? $firstValue : "";

    for my $index (1 .. $#metadataList) {
        my $value = $metadataList[$index]{$field};
        my $got = defined $value ? $value : "";
        next if $got eq $expected;
        # name the field so the mismatch can be tracked down in the metadata
        die "metadata field '$field' differs between objects of one sample: "
            . "'$expected' != '$got'\n";
    }

    return $firstValue;
}

# Translate $term through a lookup table.
#
#   $term - the key to translate
#   %hash - lookup table; because of the ($\%) prototype the caller writes a
#           plain hash and the sub receives a reference to it
#
# Returns the table's value for $term when that value is defined, otherwise
# $term itself.
sub cleanValue($\%) {
    my ($term, $lookup) = @_;
    return defined $lookup->{$term} ? $lookup->{$term} : $term;
}

# Translate a metadata variable name into the name it is looked up under in
# the controlled vocabulary.  Names without an entry in %expVarCVMapper come
# back unchanged.
sub mapMdbToCv {
    my ($mdbTerm) = @_;
    my $cvName = cleanValue($mdbTerm, %expVarCVMapper);
    return $cvName;
}

# Tell whether a view names raw data (sequence files) rather than processed
# results.
#
#   $type - the view of an object; may be undef for objects without a view
#
# Returns a true value for the raw views listed below, a false value for
# everything else, including undef.  The undef guard is new: the callers pass
# a metadata value that may be missing, which used to produce one
# "uninitialized" warning per comparison.
sub isRaw {
    my $type = shift;
    return "" unless defined $type;

    my %rawViews = map { $_ => 1 } qw(
        Fasta3p Fasta5p FastaRd1 FastaRd2
        Fastq FastqRd1 FastqRd2
        RawData
    );

    return exists $rawViews{$type} ? 1 : "";
}

# Read a checksum list in which every line is
# "<checksum><two spaces><file name>".
#
#   $checksum_filename - path of the checksum file
#
# Returns a hash (as a list) of file name => checksum.  Dies when the file
# cannot be opened.  Lines without the two-space separator are skipped (with
# a warning unless they are blank); they used to be stored under an empty
# file name.
sub readMd5 {
    my $checksum_filename = shift;
    open(my $fh, "<", $checksum_filename)
        or die "can't open checksum file $checksum_filename: $!";

    my %checksums = ();
    # a lexical line variable keeps the caller's $_ untouched
    while (my $line = <$fh>) {
        chomp $line;
        # limit of 2: a file name may itself contain two spaces
        my ($checksum, $filename) = split /  /, $line, 2;
        unless (defined $filename and length $filename) {
            print STDERR "WARNING: ignoring malformed line $. of $checksum_filename\n"
                if $line =~ /\S/;
            next;
        }
        $checksums{$filename} = $checksum;
    }
    close($fh);

    return %checksums;
}

############################################################################
# Main

my $now = time();
my $wd  = cwd();

# -release and -database take an optional value and are stored in lexicals;
# every other option ends up in its $opt_<name> package variable.
my ($release, $mySqlDatabase);
my $ok = GetOptions(
    "instance=s",
    "configDir=s",
    "verbose=i",
    "geoUser=s",
    "downloadDir=s",
    "release:s"  => \$release,
    "database:s" => \$mySqlDatabase,
    "soft",
);

# show the help text on bad options or too few positional arguments
usage() unless $ok;
usage() unless scalar(@ARGV) >= 3;

# fill in the defaults of everything that was not given
$opt_instance    = "prod"            unless defined $opt_instance;
$opt_geoUser     = "ucsc_encode_dcc" unless defined $opt_geoUser;
$opt_downloadDir = "/hive/groups/encode/dcc/analysis/ftp/pipeline/"
    unless defined $opt_downloadDir;
$release         = ""                unless defined $release;
$mySqlDatabase   = "mysqlbeta"       unless defined $mySqlDatabase;

# the config directory: the instance default, an absolute path, or a path
# relative to the directory the script was started in
my $configPath;
if (not defined $opt_configDir) {
    $configPath = "/hive/groups/encode/dcc/pipeline/encpipeline_$opt_instance/config/";
} elsif ($opt_configDir =~ /^\//) {
    $configPath = $opt_configDir;
} else {
    $configPath = "$wd/$opt_configDir";
}
die "configPath '$configPath' is invalid; Can't find the config directory\n"
    unless -d $configPath;

$opt_verbose = 1 unless defined $opt_verbose;
HgAutomate::verbose(4, "Config directory path: \'$configPath\'\n");

# positional arguments: assembly database, composite track name and the
# sequencing instrument reported for every sample
my $database  = $ARGV[0];
my $compositeName = $ARGV[1];
my $instrument = $ARGV[2];

# some counters we use
my $i;
my $j;
my $c;
my $f;
my $o;

# get the project dir; the release sub-directory is appended only when a
# release was actually named ($release defaults to the empty string, so a
# plain "defined" test was always true)
my $compositeDir = "$opt_downloadDir$database/$compositeName";
if (defined $release and $release ne "") {
    $compositeDir = "$compositeDir/$release";
}

# read the cv.ra file
my %cvTerms = Encode::getControlledVocab($configPath);

# read the checksum file
my %checksums = readMd5($compositeDir . "/md5sum.txt");

# connect to the database and read the metadata table for the obj
my $dbHandle = HgDb->new(DB => $database, HOST => $mySqlDatabase);

# read the experimental variables from expVars.ra
my @expVars = Encode::getExpVars($configPath, $compositeName);
my $numDim = scalar(@expVars);

# get the list of objects of this composite and store it in a set (hash)
my %objsSet = map { $_ => 0 } getObjsMatching($dbHandle, "composite", $compositeName, $mySqlDatabase);
my @objects = keys %objsSet;

print STDERR "Number of objects in composite track: ", scalar(keys %objsSet), "\n";
print STDERR "Number of experimental variables: ", scalar(@expVars), "\n";
for my $expVar (@expVars) {
    print STDERR "\t", $expVar, "\n";
}

# set up the package directory, unless called with -soft
my $packageName = $opt_geoUser . "_" . $compositeName;
if (not defined $opt_soft) {
    # check to see if the package name already exists
    if (-e $packageName) {
        print STDERR "Cannot make package: $packageName already exists in the current directory\n\tremove it and try again.\n";
        exit(10);
    }
    # built-in mkdir instead of a shell call, and stop right away when the
    # directory cannot be created instead of failing on every later copy
    mkdir($packageName) or die "Cannot create package directory $packageName: $!\n";
}

# the SOFT file is written next to the package directory; the handle name
# MANIFEST is used by all the output code that follows
open(MANIFEST, ">", "$packageName.soft")
    or die "Cannot open $packageName.soft for writing: $!\n";

# the list of sample ids
my @sampleIds;

# Every remaining object is compared against every sample seed, so each
# object's metadata is fetched from the database once and kept here as a
# hash reference (var => val).
my %metadataCache;
my $metadataOf = sub {
    my $obj = shift;
    $metadataCache{$obj} = +{ getObjMetadata($dbHandle, $obj, $mySqlDatabase) }
        unless exists $metadataCache{$obj};
    return $metadataCache{$obj};
};

# Tells whether a metadata record is the composite's own stub entry, which
# describes no data file.
my $isCompositeStub = sub {
    my $record = shift;
    return (defined $record->{'objType'} and $record->{'objType'} eq "composite");
};

# Looks up the checksum of a file; warns and returns "MISSING" when the
# checksum list has no entry for it.
my $checksumOf = sub {
    my $filename = shift;
    return $checksums{$filename} if defined $checksums{$filename};
    print STDERR "WARNING: $filename is missing checksum\n";
    return "MISSING";
};

# Copies one file of the composite into the package directory (nothing is
# copied with -soft).  cp is run without a shell, so unusual characters in a
# file name are passed through literally, and a failed copy is reported.
my $copyToPackage = sub {
    my $filename = shift;
    return if defined $opt_soft;
    system("cp", "$compositeDir/$filename", "$packageName/$filename") == 0
        or print STDERR "WARNING: could not copy $compositeDir/$filename to $packageName/$filename\n";
    return;
};

# while there are objects left: take one as the seed of a sample, collect
# all other objects with the same experimental variables, and write one
# ^SAMPLE section for the group
while(scalar(keys %objsSet) > 0) {
    # get the 1st object
    my $firstObject = (keys(%objsSet))[0];
    delete $objsSet{$firstObject};

    my $firstMetadata = $metadataOf->($firstObject);

    # The stub has already been removed from the set, so simply go on with
    # the next object.  (The old code assigned a replacement to shadowed
    # variables and then kept processing the stub.)
    next if $isCompositeStub->($firstMetadata);

    # get the experimental variables for that object
    my @currentMetadata = ($firstMetadata);
    my @currentExpVars;
    for my $expVar (@expVars) {
        defined $firstMetadata->{$expVar} or die "can't find field for experimental variable " . $expVar . " located in $firstObject\n";
        push @currentExpVars, $firstMetadata->{$expVar};
    }

    # get other objs with the same metadata
    for my $candidate (keys %objsSet) {
        my $candidateMetadata = $metadataOf->($candidate);

        # Tests to see if it's the stub object at top of file
        if ($isCompositeStub->($candidateMetadata)) {
            delete $objsSet{$candidate};
            next;
        }

        # all variables are checked even after a mismatch, so that an object
        # lacking a variable is reported as early as before
        my $match = 1;
        for my $dim (0 .. $#expVars) {
            defined $candidateMetadata->{$expVars[$dim]} or die "can't find field for experimental variable '" . $expVars[$dim] . "' in obj " . $candidate;
            if ($candidateMetadata->{$expVars[$dim]} ne $currentExpVars[$dim]) {
                $match = 0;
            }
        }
        if ($match) {
            push @currentMetadata, $candidateMetadata;
            delete $objsSet{$candidate};
        }
    }

    my $grant = getSameMetadata("grant", @currentMetadata);
    my $sampleIdentifier = $grant . "_" . join("_", @currentExpVars);
    my $sampleTitle = $sampleIdentifier;

    push @sampleIds, $sampleIdentifier;

    # read the organism for this cell line
    my $cell = getSameMetadata("cell", @currentMetadata);
    defined $cell or die "no cell defined for sample $sampleIdentifier\n";
    my $cellLines = $cvTerms{"Cell Line"};
    defined $cellLines or die "no \"Cell Line\" terms found in the controlled vocabulary\n";
    defined $cellLines->{$cell} or die "cell $cell of sample $sampleIdentifier not found in the controlled vocabulary\n";
    # work on a copy: a placeholder protocol may be filled in below
    my %cellLineInfo = %{$cellLines->{$cell}};
    my $organism = $cellLineInfo{"organism"};
    (my $lowerOrganism = $organism) =~ tr/A-Z/a-z/;
    my $provider = $cellLineInfo{"vendorName"};
    if (not $provider) {
        print STDERR "WARNING: No biomaterial provider given for $cell\n"
    }

    # a cell line without growth protocol is listed on STDOUT and gets the
    # placeholder "NONE" in the protocol URL
    if (not defined $cellLineInfo{"protocol"}) {
        print $cell, "\n";
        $cellLineInfo{"protocol"} = "NONE";
    }
    my $growthProtocolUrl = "http://genome.ucsc.edu/ENCODE/protocols/cells/$lowerOrganism/" . $cellLineInfo{"protocol"};
    my $growthProtocol = "Cells were grown according to ENCODE cell culture protocols: $growthProtocolUrl";

    my $extractProtocolUrl = "http://genome.ucsc.edu/cgi-bin/hgTrackUi?db=$database&g=$compositeName";
    my $extractProtocol    = "For extraction protocol details see: $extractProtocolUrl";

    my $dataProcessingUrl = "http://genome.ucsc.edu/cgi-bin/hgTrackUi?db=$database&g=$compositeName";
    my $dataProcessing    = "For data processing details see: $dataProcessingUrl";

    my $rawDataType = getSameMetadata("dataType", @currentMetadata);
    defined $rawDataType or die "no dataType defined for sample $sampleIdentifier\n";
    my $dataType = $dataTypeCleaner{$rawDataType} or die "cannot handle data type " . $rawDataType;

    my $molecule = $dataTypeMolecule{$dataType} or die "could not find molecule for data type $dataType";

    my $libraryStrategy  = $dataTypeLibraryStrategy{$dataType};
    my $librarySource    = $dataTypeLibrarySource{$dataType};
    my $librarySelection = $dataTypeLibrarySelection{$dataType};

    # generate the SOFT
    print MANIFEST "^SAMPLE=$sampleIdentifier" . "\n";
    print MANIFEST "!Sample_title = $sampleTitle" . "\n";
    print MANIFEST "!Sample_source_name = $cell" . "\n";
    print MANIFEST "!Sample_organism = $organism" . "\n";
    # iterate over exp vars
    for my $dim (0 .. $#expVars) {
        my $expVar = $expVars[$dim];
        my $value  = $currentExpVars[$dim];    # always defined, checked above

        # output the base exp var value
        my $e = getSameMetadata($expVar, @currentMetadata);
        print MANIFEST "!Sample_characteristics = $expVar: $e" . "\n";

        # nothing more to say about "no value" settings, control antibodies
        # and plain counters
        next if $value eq "None" or $value eq "none";
        next if $expVar eq "antibody" and ($value eq "Input" or $value eq "Control");
        next if $expVar eq "replicate" or $expVar eq "rank";

        # otherwise output the fields given by %expVarDetails from the cv.ra;
        # variables without such a list need no vocabulary lookup at all
        my $detailFields = $expVarDetails{$expVar};
        next unless $detailFields;

        my $cvTypeName = mapMdbToCv($expVar);
        my $cvType = $cvTerms{$cvTypeName};
        my $cvData = defined $cvType ? $cvType->{$value} : undef;
        defined $cvData or die "can't find '$value' under '$cvTypeName' in the controlled vocabulary (experimental variable $expVar)\n";

        for my $field (@{$detailFields}) {
            next unless defined $cvData->{$field};
            # if there's a better label for this term use it
            my $label = defined $detailsCleaner{$field} ? $detailsCleaner{$field} : $field;
            print MANIFEST "!Sample_characteristics = $expVar $label: ", $cvData->{$field}, "\n";
        }
    }

    if ($provider) {
        print MANIFEST "!Sample_biomaterial_provider = $provider" . "\n";
    }
    print MANIFEST "!Sample_growth_protocol = $growthProtocol" . "\n";
    print MANIFEST "!Sample_molecule = $molecule" . "\n";
    print MANIFEST "!Sample_extract_protocol = $extractProtocol" . "\n";
    print MANIFEST "!Sample_data_processing = $dataProcessing" . "\n";
    print MANIFEST "!Sample_library_strategy = $libraryStrategy" . "\n";
    print MANIFEST "!Sample_library_source = $librarySource" . "\n";
    print MANIFEST "!Sample_library_selection = $librarySelection" . "\n";
    print MANIFEST "!Sample_instrument_model = $instrument" . "\n";

    # raw files first.  Objects without a file name are skipped here too;
    # without that guard the existence test below looked at the composite
    # directory itself and always succeeded.
    my $rawCount = 1;
    for my $record (@currentMetadata) {
        my $filename = $record->{"fileName"};
        my $view     = $record->{"view"};
        next unless defined $filename;
        next unless defined $view and isRaw($view);

        if (-e "$compositeDir/$filename") {
            my $checksum = $checksumOf->($filename);
            # the file type is the second dot-separated part of the name
            my (undef, $type) = split(/\./, $filename);
            print MANIFEST "!Sample_raw_file_$rawCount = $filename" . "\n";
            print MANIFEST "!Sample_raw_file_type_$rawCount = $type" . "\n";
            print MANIFEST "!Sample_raw_file_checksum_$rawCount = $checksum" . "\n";
            $copyToPackage->($filename);
            ++$rawCount;
        } else {
            print STDERR "WARNING: file $compositeDir/$filename not found\n";
        }
    }

    # then everything that is not raw (an object without a view counts as
    # not raw)
    my $supplementCount = 1;
    for my $record (@currentMetadata) {
        my $filename = $record->{"fileName"};
        my $view     = $record->{"view"};
        if (not defined $filename) {
            print "File was generated by DCC\n";
            next;
        }
        next if defined $view and isRaw($view);

        if (-e "$compositeDir/$filename") {
            my $checksum = $checksumOf->($filename);
            print MANIFEST "!Sample_supplementary_file_$supplementCount = $filename" . "\n";
            print MANIFEST "!Sample_supplementary_file_checksum_$supplementCount = $checksum" . "\n";
            print MANIFEST "!Sample_supplementary_file_build_$supplementCount = $database" . "\n";
            $copyToPackage->($filename);
            ++$supplementCount;
        } else {
            print STDERR "WARNING: file $compositeDir/$filename not found\n";
        }
    }

    if ($rawCount == 1 and $supplementCount == 1) {
        print STDERR "WARNING: no files found for $sampleIdentifier\n";
    } elsif ($rawCount == 1) {
        print STDERR "WARNING: no raw files found for $sampleIdentifier\n";
    }
}

# The ^SERIES section closes the SOFT file: fixed boilerplate with
# "[required]" / "[optional ...]" placeholder values, links to the track,
# and one line per sample written above.
my $webLink = "http://genome.ucsc.edu/cgi-bin/hgTrackUi?db=$database&g=$compositeName";

print MANIFEST "^SERIES=$compositeName" . "\n";
print MANIFEST "!Series_title = [required]" . "\n";
print MANIFEST "!Series_summary = [required]" . "\n";
print MANIFEST "!Series_summary = For data usage terms and conditions, please refer to http://www.genome.gov/27528022  and http://www.genome.gov/Pages/Research/ENCODE/ENCODEDataReleasePolicyFinal2008.pdf" . "\n";
print MANIFEST "!Series_overall_design = [required]" . "\n";
print MANIFEST "!Series_contributor = [optional; 1 per author; use 'firstname,lastname' or 'firstname,middleinitial,lastname]" . "\n";
print MANIFEST "!Series_web_link = $webLink" . "\n";
print MANIFEST "!Series_web_link = http://www.ncbi.nlm.nih.gov/geo/info/ENCODE.html" . "\n";
for my $sampleId (@sampleIds) {
    print MANIFEST "!Series_sample_id = $sampleId" . "\n";
}

# buffered output is flushed on close, so a write error (e.g. a full disk)
# shows up here; do not let it pass silently
close(MANIFEST) or die "Error while closing the SOFT file: $!\n";
