# for emacs: -*- mode: sh; -*-

# Gorilla gorilla
#########################################################################
# DOWNLOAD SEQUENCE (DONE braney 2008-10-07)
    mkdir /hive/data/genomes/gorGor1
    cd /hive/data/genomes/gorGor1
    ln -s `pwd` /cluster/data/gorGor1
    mkdir /hive/data/genomes/gorGor1/sanger
    cd /hive/data/genomes/gorGor1/sanger

    wget --timestamping \
ftp://ftp.sanger.ac.uk/pub4/gorilla/gorilla_draft_genome_31265.agp.gz

    wget --timestamping \
ftp://ftp.sanger.ac.uk/pub/sequences/gorilla/draft_assembly_v0.1/gorilla_draft_genome_31265.fastq.gz
    wget --timestamping \
ftp://ftp.sanger.ac.uk/pub/sequences/gorilla/draft_assembly_v0.1/gorilla_draft_genome_31265.fragments.gz

XXX - working 2008-10-21
    # there are problems with all of these files

    qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin gorGor1.qual.qac

   cut -f 1 assembly.agp | uniq -c | wc -l 
   # Number of scaffolds: 116467


#########################################################################
# Create .ra file and run makeGenomeDb.pl(DONE braney 2008-10-07)
    ssh kolossus
    cd /cluster/data/gorGor1
cat << '_EOF_' >gorGor1.config.ra
# Config parameters for makeGenomeDb.pl:
db gorGor1
clade mammal
genomeCladePriority 12
scientificName  Gorilla gorilla gorilla
commonName Gorilla
assemblyDate Oct. 2008
assemblyLabel Sanger Institute genome_31265
orderKey 27
mitoAcc NC_011120
# fastaFiles
# /hive/data/genomes/gorGor1/sanger/gorilla_draft_genome_31265.fragments.gz
fastaFiles /hive/data/genomes/gorGor1/sanger/clean.fragments.fa.gz
# fastaFiles /hive/data/genomes/gorGor1/sanger/genome_31265.fa
# agpFiles /hive/data/genomes/gorGor1/sanger/gorilla_draft_genome_31265.agp.gz
agpFiles /hive/data/genomes/gorGor1/sanger/clean.agp
# qualFiles none
dbDbSpeciesDir gorilla
'_EOF_'
    # << happy emacs

    #	there is some funny business going on with the agp file, it doesn't
    #	specify gaps that are at the beginning of the scaffold.  Our tools
    #	will not work with such an AGP file, so fix it up:
    cd /hive/data/genomes/gorGor1/sanger
    cat << '_EOF_' > fixAgp.pl
#!/usr/bin/env perl

use strict;
use warnings;

open (FH,"zcat gorilla_draft_genome_31265.agp.gz|") or
        die "can not zcat gorilla_draft_genome_31265.agp.gz";
while (my $line = <FH>) {
    my ($name, $start, $end, $frag, $type, $rest) = split('\s+', $line, 6);
    if ((1 == $frag) && (1 != $start)) {
        printf "%s\t1\t%d\t1\tN\t%d\tfragment\tyes\n",
                $name, $start-1, $start-1;
    }
    print "$line";
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x fixAgp.pl
    ./fixAgp.pl > clean.agp

    #	running these step-wise to work out difficulties at each step
    makeGenomeDb.pl -stop=seq -workhorse=hgwdev -verbose=2 gorGor1.config.ra
    makeGenomeDb.pl -continue=agp -stop=agp -workhorse=hgwdev \
	-verbose=2 gorGor1.config.ra
    makeGenomeDb.pl -continue=db -stop=db -workhorse=hgwdev \
	-verbose=2 gorGor1.config.ra


    #	the names are too long for our normal chromInfo index:
LOAD DATA   INFILE '/hive/data/genomes/gorGor1/bed/chromInfo/chromInfo.tab'  INTO TABLE chromInfo

mySQL error 1062: Duplicate entry 'Supercontig_0000' for key 1
    #	Fixup the chromInfo.sql file:
    cd /hive/data/genomes/gorGor1/bed/chromInfo
    cp /cluster/home/hiram/kent/src/hg/lib/chromInfo.sql .
    #	PRIMARY KEY(chrom(20)) - was 16 before
    hgLoadSqlTab gorGor1 chromInfo *.sql *.tab
    #	continuing with the rest of the steps from jkStuff/makeDb.csh,
    #	starting with:
    hgGoldGapGl -noGl gorGor1 /cluster/data/gorGor1/gorGor1.agp

    makeGenomeDb.pl -continue=dbDb -stop=dbDb -workhorse=hgwdev \
	-verbose=2 gorGor1.config.ra
    makeGenomeDb.pl -continue=trackDb -stop=trackDb -workhorse=hgwdev \
	-verbose=2 gorGor1.config.ra


    cut -f 2 chrom.sizes | ave stdin
# Q1 855.000000
# median 1216.000000
# Q3 2773.000000
# average 3810.123774
# min 500.000000
# max 942632.000000
# count 609861
# total 2323645895.000000
# standard deviation 9559.595923

    #	looking for the N50 value:
    awk '{sum += $2; print NR, sum, $0}' chrom.sizes | less
    #	scanning down until the sum is 1/2 of the total
    #	2323645895 / 2 = 1161822947
    #	where it looks like:
# 45571 1161796841 Supercontig_0163790    11279
# 45572 1161808120 Supercontig_0164808    11279
# 45573 1161819399 Supercontig_0249037    11279
# 45574 1161830677 Supercontig_0021829    11278
# 45575 1161841955 Supercontig_0049999    11278
    #	thus N50 is size 11,278 bases at #45,574


#########################################################################
## Repeat Masker (DONE - 2008-11-03,04 - Hiram)
    screen	# to manage this several day job
    mkdir /hive/data/genomes/gorGor1/bed/repeatMasker
    cd /hive/data/genomes/gorGor1/bed/repeatMasker
    time doRepeatMasker.pl -verbose=2 \
	-workhorse=hgwdev -bigClusterHub=swarm \
	-buildDir=`pwd` gorGor1 > do.log 2>&1 &
    #	real    360m45.970s
    #	real    797m7.301s
    cat faSize.rmsk.txt
# 2323645895 bases (279671235 N's 2043974660 real 1079526671 upper 964447989
# lower) in 609861 sequences in 1 files
# %41.51 masked total, %47.18 masked real

    #	from the do.log file, to go into the bigZips/README file:
# RepeatMasker version development-$Id: RepeatMasker,v 1.22 2008/08/08
# 16:44:40 angie Exp $
#    Aug 7 2008 (open-3-2-6) version of RepeatMasker
# grep RELEASE /scratch/data/RepeatMasker/Libraries/RepeatMaskerLib.embl
# CC   RELEASE 20080801;   

#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-11-03 - Hiram)
    screen # use a screen to manage this job
    mkdir /hive/data/genomes/gorGor1/bed/simpleRepeat
    cd /hive/data/genomes/gorGor1/bed/simpleRepeat
    # 
    time doSimpleRepeat.pl -verbose=2 \
	-buildDir=`pwd` gorGor1 > do.log 2>&1 &
    #	real    1471m34.533s
    #	there were delays due to hang-ups when trying to talk to kolossus
    cat fb.simpleRepeat
    #	187515375 bases of 2075548667 (9.034%) in intersection

    #	after RM run is done, add this mask:
    cd /hive/data/genomes/gorGor1
    rm gorGor1.2bit
    twoBitMask gorGor1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed gorGor1.2bit
    #	can safely ignore warning about >=13 fields in bed file

    twoBitToFa gorGor1.2bit stdout | faSize stdin > gorGor1.2bit.faSize.txt
# 2323645895 bases (279671235 N's 2043974660 real 1077776023 upper 966198637
# lower) in 609861 sequences in 1 files
# %41.58 masked total, %47.27 masked real

    #	link to gbdb
    ln -s `pwd`/gorGor1.2bit /gbdb/gorGor1

###########################################################################
# prepare for kluster runs (DONE _ 2008-11-04 - Hiram)
    # compare to size of real bases to adjust the repMatch
    #	hg18: 2881421696
    #	gorGor1: 2043974660
    # thus: 1024 * 2043974660/2881421696 = 726
    #	rounding up to 800 for a more conservative masking
    cd /hive/data/genomes/gorGor1
    time blat gorGor1.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=gorGor1.11.ooc -repMatch=800
    #	Wrote 23284 overused 11-mers to gorGor1.11.ooc
    #	real    2m3.365s

    #	and staging data for push to kluster nodes
    mkdir /hive/data/staging/data/gorGor1
    cp -p gorGor1.2bit chrom.sizes gorGor1.11.ooc \
	/hive/data/staging/data/gorGor1
    #	request to cluster admin to push this to the kluster nodes
    #	/scratch/data/

###########################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-11-03 - Hiram)
    hgsql -e 'update dbDb set
sourceName="Sanger Institute Oct 2008 (NCBI project 31265, CABD01000000)" where name="gorGor1";' hgcentraltest
###########################################################################
#  BLATSERVERS ENTRY (DONE - 2007-11-04 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("gorGor1", "blat13", "17790", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("gorGor1", "blat13", "17791", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence
############################################################################
# Create quality track (DONE - 2008-11-10 - Hiram)
    mkdir /hive/data/genomes/gorGor1/sanger/fastq
    cd /hive/data/genomes/gorGor1/sanger/fastq
    #	the delivered fastq file is full of sequences that have nothing to
    #	do with the Supercontigs, scan it and select out only that business
    #	that belongs, and fixup the chrM sequence name (although this chrM is
    #	different than the one from NC_011120)
    zcat ../gorilla_draft_genome_31265.fastq.gz | awk '
BEGIN { done = 0 }
{
if (match($0,"^@3_42962700_339")) { done = 1; }
if (match($0,"^@gi.5835149.ref.NC_001645.1")) {
    print "@chrM gi.5835149.ref.NC_001645.1 Supercontig_" }
else { if (0 == done) { print } }
}
' | gzip -c > trimmed.fastq.gz
    #	after re-writing fastqToFa to get it to properly parse multi-line
    #	fastq files:
    fastqToFa -qual=gorGor1.qual.fa -nameVerify="Supercontig_" -noErrors \
        -verbose=2 -qualSizes=gorGor1.qual.sizes \
        trimmed.fastq.gz stdout | gzip -c > gorGor1.withAllNs.fa.gz

    #	And then, a lot of these records are all N's - so, eliminate them:
    faCount gorGor1.fa.gz > gorGor1.faCount.txt
    egrep "^Super|^chr" gorGor1.withAllNs.faCount.txt \
	| awk '{if($2 == $7) { print }}' \
	> all.Ns.names
    egrep "^Super|^chr" gorGor1.withAllNs.faCount.txt \
	| awk '{if($2 != $7) { print }}' \
	> not.all.Ns.names
    cut -f 1 not.all.Ns.names \
	| faSomeRecords gorGor1.withAllNs.fa.gz stdin stdout | gzip -c \
	> gorGor1.fa.gz

    cat << '_EOF_' > cleanQuals.pl
#!/usr/bin/env perl

use strict;
use warnings;

my %allowList;
my $allowListSize = 0;
open (FH, "cut -f 1 not.all.Ns.names|") or
        die "can not cut -f 1 not.all.Ns.names";
while (my $line = <FH>) {
    chomp $line;
    $allowList{$line} = 1;
    ++$allowListSize;
}
close (FH);
print STDERR "working with a list of $allowListSize names\n";

my $allowed = 0;
my $allowedCount = 0;
open (CL, ">clean.gorGor1.qual.fa") or
        die "can not write to clean.gorGor1.qual.fa";
open (FH, "<gorGor1.qual.fa") or die "can not read gorGor1.qual.fa";
while (my $line = <FH>) {
    if ($line =~ m/^>Supercontig_|^>chrM /) {
        my $name = $line;
        chomp $name;
        $name =~ s/^>//;
        $name =~ s/ .*//;
        if (exists($allowList{$name})) {
            $allowed = 1;
            ++$allowedCount;
        } else {
            $allowed = 0;
        }
    }
    if ($allowed) {
        print CL "$line";
    }
}
close (FH);
print STDERR "passed through $allowedCount sequences\n";
'_EOF_'
    # << happy emacs
    ./cleanQuals.pl
# working with a list of 609861 names
# passed through 609861 sequences

    cd /cluster/data/gorGor1/bed/qual
    ln -s ../../sanger/fastq/clean.gorGor1.qual.fa gorGor1.qual.fa
    qaToQac gorGor1.qual.fa gorGor1.qual.qac
    qacToWig -fixed gorGor1.qual.qac stdout | wigEncode stdin qual.{wig,wib}
    #	Converted stdin, upper limit 93.00, lower limit 0.00
    #	real    20m44.535s
    #	loading table:
    rm -f /gbdb/gorGor1/wib/qual.wib
    ln -s /cluster/data/gorGor1/bed/qual/qual.wib /gbdb/gorGor1/wib/
    time hgLoadWiggle -tmpDir=/scratch/tmp \
	-pathPrefix=/gbdb/gorGor1/wib gorGor1 quality qual.wig
    #	real    1m8.385s

############################################################################
