# for emacs: -*- mode: sh; -*-

# Haemonchus contortus
# Sanger Institute and WormBase
#	http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=6289
#	Barber Pole worm
#	http://www.sanger.ac.uk/Projects/H_contortus/
#	ftp://ftp.sanger.ac.uk/pub/pathogens/Haemonchus/contortus/genome/README
#	http://www.nematode.net/Species.Summaries/Haemonchus.contortus/index.php

##############################################################################
## Fetch sequence (DONE - 2010-09-22 - Hiram)

    # Work in a ws210 directory under the haeCon1 build area.
    mkdir -p /hive/data/genomes/haeCon1/ws210
    cd /hive/data/genomes/haeCon1/ws210

    # Mirror the WormBase WS210 h_contortus directory from the Sanger FTP
    # site.  -nH and --cut-dirs=5 drop the host name and the five leading
    # remote directory components, so files land relative to this directory
    # (e.g. sequences/dna/... as used below).
    wget --no-parent --timestamping -m -nH --cut-dirs=5 \
	ftp://ftp.sanger.ac.uk/pub/wormbase/WS210/genomes/h_contortus/

    # Write haeCon1.ucsc.agp from the downloaded fasta; this file is the
    # agpFiles entry in haeCon1.config.ra.
    hgFakeAgp -minScaffoldGap=100000 -minContigGap=1 \
	sequences/dna/h_contortus.WS210.dna.fa.gz haeCon1.ucsc.agp

##############################################################################
## Initial browser build (DONE - 2010-09-23 - Hiram)
    cd /hive/data/genomes/haeCon1

    # Write the makeGenomeDb.pl configuration file.
    #  - The here-document delimiter is quoted so nothing in the body is
    #    expanded.  sh/bash remove the quotes from the delimiter, so the
    #    closing line has to be the bare word _EOF_ (a line reading '_EOF_'
    #    does not end the here-document).
    #  - Every setting is one "key value" line, so fastaFiles and its path
    #    are kept together on a single line like all the other settings.
    cat << '_EOF_' > haeCon1.config.ra
# Config parameters for makeGenomeDb.pl:
db haeCon1
clade worm
genomeCladePriority 86
scientificName Haemonchus contortus
commonName H. contortus
assemblyDate Dec. 2009
assemblyLabel Sanger Institute
assemblyShortLabel WormBase WS210
orderKey 898
mitoAcc NC_010383.2
fastaFiles /hive/data/genomes/haeCon1/ws210/sequences/dna/h_contortus.WS210.dna.fa.gz
agpFiles /hive/data/genomes/haeCon1/ws210/haeCon1.ucsc.agp
# qualFiles none
dbDbSpeciesDir worm
taxId 6289
_EOF_
    # << happy emacs

    # Run makeGenomeDb.pl in three stages, each logged to its own file:
    # stage 1 - from the start, stopping after the agp step
    makeGenomeDb.pl -workhorse=hgwdev -verbose=2 \
	-stop=agp haeCon1.config.ra > agp.log 2>&1
    # stage 2 - the db step only
    time makeGenomeDb.pl -workhorse=hgwdev -verbose=2 \
	-continue=db -stop=db haeCon1.config.ra > db.log 2>&1
    # stage 3 - from the dbDb step through to the end
    time makeGenomeDb.pl -workhorse=hgwdev -verbose=2 \
	-continue=dbDb haeCon1.config.ra > dbDb.log 2>&1
    #	real    0m2.452s

#########################################################################
# REPEATMASKER (DONE - 2010-09-23 - Hiram)
    screen 	#	use screen to control the job
    mkdir /hive/data/genomes/haeCon1/bed/repeatMasker
    cd /hive/data/genomes/haeCon1/bed/repeatMasker
    # Run in the background at the lowest scheduling priority, building in
    # the current directory; stdout and stderr both go to do.log.
    time nice -n +19 doRepeatMasker.pl -noSplit -bigClusterHub=pk \
	-buildDir=`pwd` haeCon1 > do.log 2>&1 &
    #	real    49m9.090s

    #	from the do.log:
    #	RepeatMasker version development-$Id:
    #	RepeatMasker,v 1.25 2010/09/08 21:32:26 angie Exp
    #	CC   RELEASE 20090604;                                            *
    #   June 30 2010 (open-3-2-9) version of RepeatMasker

    # Masking summary left in the build directory (recorded output follows):
    cat faSize.rmsk.txt 
    #	297989404 bases (19144420 N's 278844984 real 276673660 upper
    #	2171324 lower) in 59708 sequences in 1 files
    #	%0.73 masked total, %0.78 masked real

#########################################################################
# SIMPLE REPEATS (DONE - 2010-09-23 - Hiram)
    screen 	#	use screen to control the job
    mkdir /hive/data/genomes/haeCon1/bed/simpleRepeat
    cd /hive/data/genomes/haeCon1/bed/simpleRepeat
    # Run in the background at the lowest scheduling priority, building in
    # the current directory; stdout and stderr both go to do.log.
    time nice -n +19 doSimpleRepeat.pl -workhorse=hgwdev \
	-smallClusterHub=memk -buildDir=`pwd` haeCon1 > do.log 2>&1 &
    #	real    68m5.964s

    # Coverage summary left in the build directory (recorded output follows):
    cat fb.simpleRepeat 
    #	9422991 bases of 278844984 (3.379%) in intersection

#########################################################################
# run window masker (DONE - 2010-09-23 - Hiram)
    mkdir /hive/data/genomes/haeCon1/bed/windowMasker
    cd /hive/data/genomes/haeCon1/bed/windowMasker
    # Run in the background, building in the current directory; stdout and
    # stderr both go to do.log.
    time doWindowMasker.pl -verbose=2 -workhorse=kolossus \
	-buildDir=`pwd` haeCon1 > do.log 2>&1 &
    #	real    12m36.101s

    # Measure how much of the resulting 2bit is masked (lower case).
    twoBitToFa haeCon1.wmsk.sdust.2bit stdout | faSize stdin
    #	297989404 bases (19144420 N's 278844984 real 229219802 upper
    #	49625182 lower) in 59708 sequences in 1 files
    #	%16.65 masked total, %17.80 masked real

    #	load this initial data to get ready to clean it
    hgLoadBed haeCon1 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 1245986 elements of size 3
    featureBits haeCon1 windowmaskerSdust
    #	68752934 bases of 278844984 (24.656%) in intersection

    #	eliminate the gaps from the masking
    #	step 1: write every non-gap region to notGap.bed
    time featureBits haeCon1 -not gap -bed=notGap.bed
    #	278844984 bases of 278844984 (100.000%) in intersection
    #	real    0m21.941s
    #	step 2: intersect the loaded track with notGap.bed and keep the
    #	result as a gzipped bed file
    time nice -n +19 featureBits haeCon1 windowmaskerSdust notGap.bed \
        -bed=stdout | gzip -c > cleanWMask.bed.gz
    #	real    25m33.134s
    #	49625182 bases of 278844984 (17.797%) in intersection

    #	reload track to get it clean
    hgLoadBed haeCon1 windowmaskerSdust cleanWMask.bed.gz
    #	Loaded 1227338  elements of size 3
    #	re-measure the reloaded track; the figure now matches the "lower"
    #	count reported by faSize above
    time featureBits haeCon1 windowmaskerSdust \
	> fb.haeCon1.cleanWMask.txt 2>&1 &
    #	real    0m34.903s
    cat fb.haeCon1.cleanWMask.txt 
    #	49625182 bases of 278844984 (17.797%) in intersection

    cd /hive/data/genomes/haeCon1
    #	mask the sequence with this clean mask
    #	(input haeCon1.unmasked.2bit, output haeCon1.2bit)
    zcat bed/windowMasker/cleanWMask.bed.gz \
	| twoBitMask haeCon1.unmasked.2bit stdin \
	    -type=.bed haeCon1.2bit
    #	record the size/masking summary of the final 2bit
    twoBitToFa haeCon1.2bit stdout | faSize stdin \
        > haeCon1.faSize.txt
    cat haeCon1.faSize.txt
    #	297989404 bases (19144420 N's 278844984 real 229219802 upper
    #	49625182 lower) in 59708 sequences in 1 files
    #	%16.65 masked total, %17.80 masked real

    #	make the masked 2bit visible under /gbdb via a symlink
    ln -s `pwd`/haeCon1.2bit /gbdb/haeCon1/haeCon1.2bit

########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2010-09-22 - Hiram)
    # Scale the human -repMatch threshold by relative genome size:
    # numerator is haeCon1 gapless bases "real" as reported by faSize
    # denominator is hg19 gapless bases as reported by featureBits,
    #	featureBits hg19 gap
    # 1024 is threshold used for human -repMatch:
    calc \( 278844984 / 2897310462 \) \* 1024
    #	( 278844984 / 2897310462 ) * 1024 = 98.552526
    # rounding up to 100 is good enough for this one

    cd /hive/data/genomes/haeCon1
    # /dev/null is given for both the query and the output: this run only
    # writes the overused 11-mer list named by -makeOoc.
    blat haeCon1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=haeCon1.11.ooc -repMatch=100
    #	Wrote 7849 overused 11-mers to haeCon1.11.ooc

    # Copy the 2bit, chrom.sizes and ooc file to the staging area,
    # preserving timestamps and modes (-p).
    mkdir /hive/data/staging/data/haeCon1
    cp -p haeCon1.2bit chrom.sizes haeCon1.11.ooc \
	/hive/data/staging/data/haeCon1

########################################################################
# LASTZ SWAP ce9 (DONE - 2010-09-23 - Hiram)
    # original alignment
    # (done with ce9 as the target; its coverage of ce9 is recorded here)
    cd /hive/data/genomes/ce9/bed/blastzHaeCon1.2010-09-23
    cat fb.ce9.chainHaeCon1Link.txt 
    #	7484042 bases of 100286004 (7.463%) in intersection

    # recip best to see how it looks
    time doRecipBest.pl -workhorse=hgwdev -verbose=2 -buildDir=`pwd` \
	ce9 haeCon1 > rbest.log 2>&1 &
    #	real    4m5.408s

    #	swap, this is also in haeCon1.txt
    #	The swap reuses the DEF file of the ce9 alignment and is built in a
    #	separate directory under haeCon1.
    mkdir /hive/data/genomes/haeCon1/bed/blastz.ce9.swap
    cd /hive/data/genomes/haeCon1/bed/blastz.ce9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/ce9/bed/blastzHaeCon1.2010-09-23/DEF \
	-qRepeats=windowmaskerSdust -bigClusterHub=pk \
	-workhorse=hgwdev -smallClusterHub=memk -swap > swap.log 2>&1 &
    #	real    7m35s

    #	coverage of haeCon1 by the swapped chains
    cat fb.haeCon1.chainCe9Link.txt
    #	8860569 bases of 278844984 (3.178%) in intersection

#########################################################################
# GENBANK AUTO UPDATE (DONE - 2010-09-27 - Hiram)
    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # edit etc/genbank.conf to add haeCon1 just before caeJap2
    # The stanza below (through perChromTables) is the text added to
    # etc/genbank.conf; these lines are file content, not commands to run.

# haeCon1 (Haemonchus contortus)
haeCon1.serverGenome = /hive/data/genomes/haeCon1/haeCon1.2bit
haeCon1.clusterGenome = /scratch/data/haeCon1/haeCon1.2bit
haeCon1.ooc = /scratch/data/haeCon1/haeCon1.11.ooc
haeCon1.lift = no
haeCon1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
haeCon1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
haeCon1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
haeCon1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
haeCon1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
haeCon1.refseq.mrna.native.load = yes
haeCon1.refseq.mrna.xeno.load  = yes  
haeCon1.refseq.mrna.xeno.loadDesc = yes
haeCon1.genbank.mrna.xeno.load = yes
haeCon1.genbank.est.native.load = yes
haeCon1.genbank.est.native.loadDesc = no
haeCon1.downloadDir = haeCon1
haeCon1.perChromTables = no

    # commit only the edited config file, then publish it
    git commit -m "Added haeCon1 H. contortus WS210" etc/genbank.conf
    git push
    # update /cluster/data/genbank/:
    make etc-update

    # Initial alignment, run on the genbank host in the background at the
    # lowest scheduling priority.
    ssh genbank
    screen		#	use a screen to manage this job
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial haeCon1 &
    #	logFile: var/build/logs/2010.09.27-13:30:01.haeCon1.initalign.log
    #	real    737m20.406s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad haeCon1
    #	logFile: var/dbload/hgwdev/logs/2010.09.28-09:59:58.dbload.log
    #	real    23m57.519s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    git pull
    # add haeCon1 to:
    # (the two indented lines below are the files to edit, not commands)
        etc/align.dbs
        etc/hgwdev.dbs
    git commit -m "Added haeCon1" etc/align.dbs etc/hgwdev.dbs
    git push
    make etc-update

############################################################################
#  BLATSERVERS ENTRY (DONE - 2008-06-04 - Hiram)
#	NOTE(review): this date precedes the 2010 dates on every other step of
#	this build; presumably left over from a template - confirm the real date.
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    # Add two blatServers rows for haeCon1 on host blat5 in hgcentraltest:
    # port 17792 with isTrans=1, canPcr=0 and port 17793 with isTrans=0,
    # canPcr=1.
    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("haeCon1", "blat5", "17792", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("haeCon1", "blat5", "17793", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

############################################################################
# reset default position to ZC101.2e protein blat result
    ssh hgwdev
    # Set the haeCon1 defaultPos in the dbDb table of hgcentraltest.
    hgsql -e 'update dbDb set defaultPos="Hcon_Contig0059139:22,342-52,602"
	where name="haeCon1";' hgcentraltest

############################################################################
# ELEGANS (ce9) PROTEINS TRACK (DONE - 2010-10-07 - Hiram)
    # Build the nucleotide blast databases from the unmasked sequence.
    cd /hive/data/genomes/haeCon1
    mkdir blastDb
    twoBitToFa haeCon1.unmasked.2bit temp.fa
    # Split into blastDb/x* pieces and write blastDb.lft, the lift file that
    # blastSome later uses to lift results back from the pieces.
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    #	59708 pieces of 59708 written
    rm temp.fa
    cd blastDb
    # Format every piece as a blast database (-p F), then drop the fasta.
    for i in *.fa
    do
	/scratch/data/blast-2.2.11/bin/formatdb -i $i -p F
    done
    rm *.fa

    ## create the query protein set
    mkdir -p /hive/data/genomes/haeCon1/bed/tblastn.ce9SG
    cd /hive/data/genomes/haeCon1/bed/tblastn.ce9SG
    # query.lst: one blast database per line, ordered by .nsq file size
    # (ls -S), with the .nsq suffix removed.
    echo  /hive/data/genomes/haeCon1/blastDb/*.nsq | xargs ls -S \
	| sed "s/\.nsq//"  > query.lst
    wc -l query.lst
    # 59708 query.lst

   # we want around 400000 jobs
   # Lines per protein chunk = (lines in ce9SG.psl) / (400000 / databases);
   # the result is rounded to 4000 for the split below.
    calc `wc -l /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl  | cut -d" " -f1`/\(400000/`wc -l query.lst | cut -d" " -f1`\)
    #	28103/(400000/59708) = 4194.934810

   # Split the ce9SG psl into 4000-line chunks named sgfa/sg??, convert each
   # chunk to fasta and remove the psl chunk.
   mkdir sgfa
   split -l 4000 /hive/data/genomes/ce9/bed/blat.ce9SG/ce9SG.psl sgfa/sg
   cd sgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
   done
   cd ..
   # sg.lst: the protein fasta chunks, largest first; one blastOut
   # sub-directory is created per chunk.
   ls -1S sgfa/*.fa > sg.lst
   mkdir blastOut
   for i in `cat sg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   
   # Write the gensub2 template for the tblastn run.  The delimiter is
   # quoted so the $(...) placeholders are written literally; the closing
   # line is the bare word _EOF_ because sh/bash strip the quotes from the
   # delimiter (a line reading '_EOF_' would not end the here-document).
   cat << '_EOF_' > template
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
_EOF_
    # << happy emacs

   # Write the per-job script blastSome (script text itself is unchanged).
   #   $1  blast database path without suffix (a line of query.lst)
   #   $2  protein fasta chunk (a line of sg.lst)
   #   $3  output psl named by the job template
   # The script copies the database to ${SCR}, runs tblastn with
   # successively stricter -e values until blastall succeeds, converts the
   # result to psl, lifts target then query coordinates, and moves the
   # result to $3 only when pslCheck passes.
   # NOTE(review): when pslCheck fails the script still reaches "exit 0"
   # without writing $3 and without removing its temporary files; the job
   # is then only flagged by the template's "check out exists" clause.
   # The here-document delimiter is quoted so the script's own $variables
   # are written literally; the closing line is the bare word _EOF_ because
   # sh/bash strip the quotes from the delimiter.
   cat << '_EOF_' > blastSome
#!/bin/sh
DB=haeCon1
BLASTMAT=/scratch/data/blast-2.2.11/data
SCR="/scratch/tmp/${DB}"
g=`basename $2`
D=`basename $1`
export BLASTMAT DB SCR g D
mkdir -p ${SCR}
cp -p $1.* ${SCR}
f=${SCR}/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/data/blast-2.2.11/bin/blastall -M BLOSUM80 -m 0 -F no \
	-e $eVal -p tblastn -d ${SCR}/$D -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/x86_64/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -nohead $f.3 \
	    /hive/data/genomes/${DB}/blastDb.lft carry $f.2 > /dev/null
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp \
	    /hive/data/genomes/ce9/bed/blat.ce9SG/protein.lft warn $f.3 > /dev/null
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4 ${SCR}/$D.*
            rmdir --ignore-fail-on-non-empty ${SCR}
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4 ${SCR}/$D.*
rmdir --ignore-fail-on-non-empty ${SCR}
exit 1
_EOF_
    # << happy emacs
    chmod +x blastSome

    ssh swarm
    cd /hive/data/genomes/haeCon1/bed/tblastn.ce9SG
    # Expand the template over every (query.lst, sg.lst) pair into jobList
    # and hand it to parasol.
    gensub2 query.lst sg.lst template jobList
    para create jobList
    # (the next line is shorthand for the usual sequence of para
    # sub-commands, not a literal command line)
    para try, check, push, check etc.
# Completed: 477664 of 477664 jobs
# CPU time in finished jobs:   25077216s  417953.60m  6965.89h  290.25d  0.795 y
# IO & Wait Time:               3647195s   60786.59m  1013.11h   42.21d  0.116 y
# Average job time:                  60s       1.00m     0.02h    0.00d
# Longest finished job:             241s       4.02m     0.07h    0.00d
# Submission to last job:         38732s     645.53m    10.76h    0.45d

    # do the cluster run for chaining
    # (set up in its own chainRun directory next to blastOut)
    ssh swarm
    mkdir /hive/data/genomes/haeCon1/bed/tblastn.ce9SG/chainRun
    cd /hive/data/genomes/haeCon1/bed/tblastn.ce9SG/chainRun
    # Write the gensub2 template for the chaining run.  The delimiter is
    # quoted so the $(...) placeholders are written literally; the closing
    # line is the bare word _EOF_ because sh/bash strip the quotes from the
    # delimiter (a line reading '_EOF_' would not end the here-document).
    cat << '_EOF_' > template
#LOOP
chainOne $(path1) {check out exists+ ../blastOut/c.$(file1).psl}
#ENDLOOP
_EOF_
    # << happy emacs

    # Write the per-directory chaining script chainOne (a csh script; its
    # text is unchanged).  It changes into the blastOut sub-directory given
    # as $1, concatenates that directory's q.*.psl files and pipes them
    # through simpleChain into blastOut/c.<directory name>.psl.
    # The delimiter is quoted so $1 and $b are written literally; the
    # closing line is the bare word _EOF_ because sh/bash strip the quotes
    # from the delimiter.
    cat << '_EOF_' > chainOne
#!/bin/csh -fe
cd $1
set b = $1:t
cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin \
/hive/data/genomes/haeCon1/bed/tblastn.ce9SG/blastOut/c.$b.psl
_EOF_
    # << happy emacs
    chmod +x chainOne

    # chain.lst: the blastOut/sg?? directories, one per line; gensub2 then
    # expands the template once per directory ("single" second list).
    ls -1dS /hive/data/genomes/haeCon1/bed/tblastn.ce9SG/blastOut/sg?? \
	> chain.lst
    gensub2 chain.lst single template jobList
    cd /hive/data/genomes/haeCon1/bed/tblastn.ce9SG/chainRun
    para create jobList
    # (the next line is shorthand for the usual sequence of para
    # sub-commands, not a literal command line)
    para try, check, push, check etc.
# Completed: 8 of 8 jobs
# CPU time in finished jobs:        489s       8.15m     0.14h    0.01d  0.000 y
# IO & Wait Time:                  8958s     149.30m     2.49h    0.10d  0.000 y
# Average job time:                1181s      19.68m     0.33h    0.01d
# Longest finished job:            1187s      19.78m     0.33h    0.01d
# Submission to last job:          1192s      19.87m     0.33h    0.01d

    # Filter the chained psl of every sg?? chunk.  Column numbers follow the
    # UCSC PSL layout: $1 matches, $11 qSize, $12 qStart, $13 qEnd,
    # $14 tName, $16 tStart, $17 tEnd.
    cd /hive/data/genomes/haeCon1/bed/tblastn.ce9SG/blastOut
    for i in sg??
    do
       # c60: chains whose aligned query span exceeds 60% of the query size
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       # u: c60 sorted in descending numeric order, then passed to pslUniq
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       # m60: c60 chains whose match count exceeds 60% of the query size
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    # Merge the u.* and m60.* sets, ordered by target name, start and end,
    # and drop adjacent duplicate lines.
    sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \
	> ../blastCe9SG.psl
    cd ..
    pslCheck blastCe9SG.psl
    #	checked: 21531 failed: 0 errors: 0

    # load table 
    ssh hgwdev
    cd /hive/data/genomes/haeCon1/bed/tblastn.ce9SG
    hgLoadPsl haeCon1 blastCe9SG.psl

    # check coverage
    # (haeCon1 first, then the same track name in other databases and
    # sangerGene on ce9 for comparison)
    featureBits haeCon1 blastCe9SG
    #	4990746 bases of 278844984 (1.790%) in intersection
    featureBits cb3 blastCe9SG
    #	18490367 bases of 108433446 (17.052%) in intersection
    featureBits caeRem3 blastCe9SG
    #	20302540 bases of 138406388 (14.669%) in intersection
    featureBits caePb2 blastCe9SG
    #	23730009 bases of 170473138 (13.920%) in intersection
    featureBits caeJap3 blastCe9SG
    #	12894398 bases of 154057934 (8.370%) in intersection
    featureBits melHap1 blastCe9SG
    #	4376245 bases of 53017507 (8.254%) in intersection
    featureBits melInc1 blastCe9SG
    #	3882043 bases of 82095019 (4.729%) in intersection
    featureBits priPac2 blastCe9SG
    #	5436779 bases of 133634773 (4.068%) in intersection
    featureBits bruMal1 blastCe9SG 
    #	4424694 bases of 89235536 (4.958%) in intersection
    featureBits ce9 sangerGene
    #	28689552 bases of 100286004 (28.608%) in intersection

    # remove the per-job blast output now that the merged psl is loaded
    rm -rf blastOut

#########################################################################
# verify all.joiner has everything (DONE - 2010-10-21 - Hiram)
    # edit all.joiner until all these commands are successful
    # (each check is run separately first, then everything with -all)
    cd $HOME/kent/src/hg/makeDb/schema
    joinerCheck -tableCoverage -database=haeCon1 ./all.joiner
    joinerCheck -keys -database=haeCon1 ./all.joiner
    joinerCheck -times -database=haeCon1 ./all.joiner
    joinerCheck -all -database=haeCon1 ./all.joiner
    #	the -all command will complain about other databases on hgwdev
    #	that are not specified in all.joiner.  There are a lot of test
    #	databases on hgwdev

#########################################################################
# construct downloads files (DONE - 2010-10-21 - Hiram)
    cd /hive/data/genomes/haeCon1
    # Build the download files; stdout and stderr both go to downloads.log.
    makeDownloads.pl -dbHost=hgwdev -workhorse=hgwdev -verbose=2 haeCon1 \
	 > downloads.log 2>&1

#########################################################################
## Creating pushQ (DONE - 2010-10-21 - Hiram)
    ssh hgwdev
    mkdir /hive/data/genomes/haeCon1/pushQ
    cd /hive/data/genomes/haeCon1/pushQ
    # The generated SQL goes to haeCon1.sql and any messages on stderr to
    # errorLog.out.
    makePushQSql.pl haeCon1 > haeCon1.sql 2> errorLog.out

    ## check the errorLog.out for anything that needs to be fixed
    ## copy haeCon1.sql to hgwbeta:/tmp
    ## then on hgwbeta:
    hgsql qapushq < haeCon1.sql

#######################################################################
