# for emacs: -*- mode: sh; -*-


# Drosophila mojavensis -- 
# 
# Agencourt's 1 August 2005 assembly
#

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +-------------+---------------------------------+
#  | tableName   | type                            |
#  +-------------+---------------------------------+
#  | xenoRefGene | genePred xenoRefPep xenoRefMrna |
#  | geneMapper  | genePred                        |
#  | genscan     | genePred genscanPep             |
#  +-------------+---------------------------------+



# DOWNLOAD SEQUENCE (DONE 8/5/05 angie)
    ssh kkstore02
    mkdir /cluster/store11/droMoj2
    cd /cluster/data
    ln -s /cluster/store11/droMoj2 droMoj2
    cd /cluster/data/droMoj2
    mkdir jkStuff bed
    mkdir downloads
    cd downloads
    wget http://rana.lbl.gov/drosophila/assemblies/dmoj_agencourt_arachne_01aug05.tar.gz
    tar xvzf dmoj_agencourt_arachne_01aug05.tar.gz
    cd agencourt_arachne_01aug2005
    faSize scaffolds.fa
#194270144 bases (13750513 N's 180519631 real 180519631 upper 0 lower) in 6843 sequences in 1 files
#Total size: mean 28389.6 sd 757113.0 min 101 (scaffold_5710) max 34172700 (scaffold_6540) median 1671
#N count: mean 2009.4 sd 16940.3
#U count: mean 26380.2 sd 746670.3
#L count: mean 0.0 sd 0.0


# PARTITION SCAFFOLDS FOR REPEATMASKER RUN (DONE 8/5/05 angie)
    # Chop up large scaffolds into ~500kb chunks for RepeatMasker, 
    # then glom the tiny scaffolds up into ~500k collections (looks like 
    # some almost-500k pieces are glommed together --> a few almost-1M chunks,
    # but that's OK).
    ssh kkstore02
    cd /cluster/data/droMoj2
    mkdir scaffoldsSplit
    mv downloads/agencourt_arachne_01aug2005/scaffolds.fa .
    faSplit size scaffolds.fa 500000 -oneFile \
      scaffoldsSplit -lift=jkStuff/scaffoldsSplit.lft
    mkdir chunks500k
    faSplit about scaffoldsSplit.fa 500000 chunks500k/chunk_


# CREATING DATABASE (DONE 8/5/05 angie)
    # Create the database.
    ssh hgwdev
    # Make sure there is at least 5 gig free for the database
    df -h /var/lib/mysql
#/dev/sdc1     /dev/sdc1             1.8T  970G  690G  59% /var/lib/mysql
    hgsql '' -e 'create database droMoj2'
    # Copy the table "grp" from an existing database to the new database
    hgsql droMoj2 -e 'create table grp (PRIMARY KEY(NAME)) select * from dm2.grp'


# RUN REPEAT MASKER (DONE 8/5/05 angie)
    ssh kkstore02
    cat /cluster/bluearc/RepeatMasker/Libraries/version 
#RepBase Update 9.11, RM database version 20050112
    # make the run directory, output directory, and job list
    cd /cluster/data/droMoj2
    cat << '_EOF_' > jkStuff/RMDrosophila
#!/bin/csh -fe

cd $1
/bin/mkdir -p /tmp/droMoj2/$2
/bin/cp ../chunks500k/$2 /tmp/droMoj2/$2/
pushd /tmp/droMoj2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -spec drosophila $2
popd
/bin/cp /tmp/droMoj2/$2/$2.out ./
/bin/rm -fr /tmp/droMoj2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/droMoj2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/droMoj2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMDrosophila
    mkdir RMRun RMOut
    cp /dev/null RMRun/RMJobs
    foreach f ( chunks500k/*.fa )
      set chunk = $f:t
      echo ../jkStuff/RMDrosophila \
           /cluster/data/droMoj2/RMOut $chunk \
           '{'check in line+ /cluster/data/droMoj2/$f'}' \
         '{'check out line+ /cluster/data/droMoj2/RMOut/$chunk.out'}' \
      >> RMRun/RMJobs
    end

    # do the run
    ssh kk9
    cd /cluster/data/droMoj2/RMRun
    para make RMJobs
    para time
#Completed: 379 of 379 jobs
#Average job time:                3453s      57.55m     0.96h    0.04d
#Longest finished job:            5584s      93.07m     1.55h    0.06d
#Submission to last job:         16519s     275.32m     4.59h    0.19d

    # Lift up the split-scaffold .out's to scaffold .out's
    ssh kkstore02
    cd /cluster/data/droMoj2
    foreach f (RMOut/*.fa.out)
      liftUp $f:r:r.scaf.out jkStuff/scaffoldsSplit.lft warn $f > /dev/null
    end
    # Make a consolidated scaffold .out file too:
    head -3 RMOut/chunk_00.fa.out > RMOut/scaffolds.fa.out
    foreach f (RMOut/*.scaf.out)
      tail +4 $f >> RMOut/scaffolds.fa.out 
    end
    # Load the .out files into the database with:
    ssh hgwdev
    hgLoadOut droMoj2 /cluster/data/droMoj2/RMOut/scaffolds.fa.out
    # hgLoadOut made a "scaffolds_rmsk" table even with -table=rmsk, 
    # but we want a non-split with no prefix table:
    hgsql droMoj2 -e 'rename table scaffolds_rmsk to rmsk'
    # Fix up the indices too:
    hgsql droMoj2 -e 'drop index bin       on rmsk; \
                  drop index genoStart on rmsk; \
                  drop index genoEnd   on rmsk; \
                  create index bin       on rmsk (genoName(7), bin); \
                  create index genoStart on rmsk (genoName(7), genoStart);'


# EXTRACT AGP FROM ASSEMBLY.LINKS FILE (DONE 1/23/06 angie)
    ssh hgwdev
    cd /cluster/data/droMoj2
    # assembly.links includes negative gap values which are supposed to 
    # indicate overlap between contigs, but when scaffold sequences were 
    # generated, Mike Eisen chose to use +25 instead of the negative values.
    # So replace negative gap values with 25 for consistency with the 
    # scaffold sequences, then translate to AGP:
    perl -wpe 'next if (/^#/); @w = split; \
               $w[6] = 25 if ($w[6] < 0); $w[7] = 25 if ($w[7] < 0); \
               $_ = join("\t", @w) . "\n";' \
      downloads/agencourt_arachne_01aug2005/assembly.links \
    | ~/kent/src/utils/arachneLinksToAgp.pl \
      > scaffolds.agp
    nice checkAgpAndFa scaffolds.agp scaffolds.fa | tail
    hgGoldGapGl -noGl droMoj2 scaffolds.agp


# EXTRACTING GAP INFO FROM BLOCKS OF NS (DONE 8/5/04 angie)
    ssh kkstore02
    mkdir /cluster/data/droMoj2/bed/fakeAgp
    cd /cluster/data/droMoj2/bed/fakeAgp
    faGapSizes ../../scaffolds.fa \
        -niceSizes=5,10,20,25,30,40,50,100,250,500,1000,10000,100000
    # A disproportionately large number of gaps are exactly 25bp long, so
    # hgFakeAgp's default -minContigGap of 25 will be fine.  
    hgFakeAgp ../../scaffolds.fa fake.agp
    ssh hgwdev
    hgLoadGap -unsplit droMoj2 /cluster/data/droMoj2/bed/fakeAgp/fake.agp


# SIMPLE REPEATS (TRF) (DONE 8/5/05 angie)
    ssh kolossus
    mkdir /cluster/data/droMoj2/bed/simpleRepeat
    cd /cluster/data/droMoj2/bed/simpleRepeat
    nice trfBig -trf=/cluster/bin/i386/trf ../../scaffolds.fa \
      /dev/null -bedAt=simpleRepeat.bed -tempDir=/tmp \
    |& egrep -v '^(Removed|Tandem|Copyright|Loading|Allocating|Initializing|Computing|Scanning|Freeing)' \
    > trf.log &
    # check on this with
    tail -f trf.log

    # Load this into the database as so
    ssh hgwdev
    hgLoadBed droMoj2 simpleRepeat \
      /cluster/data/droMoj2/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql


# FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 8/5/05 angie)
    # make a filtered version of the trf output: 
    # keep trf's with period <= 12:
    ssh kkstore02
    cd /cluster/data/droMoj2/bed/simpleRepeat
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed


# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 8/5/05 angie)
    ssh kkstore02
    cd /cluster/data/droMoj2
    maskOutFa -soft scaffolds.fa bed/simpleRepeat/trfMask.bed \
      scaffolds.fa
    maskOutFa -softAdd scaffolds.fa RMOut/scaffolds.fa.out scaffolds.fa
    # Now clean up the unmasked split scaffolds to avoid confusion later.
    rm -r chunks500k scaffoldsSplit.fa jkStuff/scaffoldsSplit.lft


# STORE SEQUENCE AND ASSEMBLY INFORMATION (DONE 8/5/05 angie)
    # Translate to 2bit
    ssh kkstore02
    cd /cluster/data/droMoj2
    faToTwoBit scaffolds.fa droMoj2.2bit
    # Make chromInfo.tab.
    mkdir bed/chromInfo
    twoBitInfo droMoj2.2bit stdout \
    | awk '{printf "%s\t%s\t/gbdb/droMoj2/droMoj2.2bit\n", $1, $2;}' \
    > bed/chromInfo/chromInfo.tab

    # Make symbolic a link from /gbdb/droMoj2 to the 2bit.
    ssh hgwdev
    mkdir -p /gbdb/droMoj2
    ln -s /cluster/data/droMoj2/droMoj2.2bit /gbdb/droMoj2/
    # Load chromInfo table.
    hgsql droMoj2 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql droMoj2 -e 'load data local infile \
      "/cluster/data/droMoj2/bed/chromInfo/chromInfo.tab" into table chromInfo'
    # Make chrom.sizes from chromInfo contents and check scaffold count.
    hgsql droMoj2 -N -e 'select chrom,size from chromInfo' \
    > /cluster/data/droMoj2/chrom.sizes
    wc -l /cluster/data/droMoj2/chrom.sizes
#   6843 /cluster/data/droMoj2/chrom.sizes


# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 8/5/05 angie)
    # Warning: genome and organism fields must correspond
    # with defaultDb values
    hgsql -h genome-testdb hgcentraltest -e \
       'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
             defaultPos, active, orderKey, genome, scientificName, \
             htmlPath, hgNearOk, hgPbOk, sourceName) values \
        ("droMoj2", "Aug. 2005", "/gbdb/droMoj2", "D. mojavensis", \
             "scaffold_6500:23838548-23875233", 1, 57, \
             "D. mojavensis", \
             "Drosophila mojavensis", "/gbdb/droMoj2/html/description.html", \
             0, 0, "Agencourt 1 August 2005");'
    # This is not the first droAna, so defaultDb and genomeClade already 
    # have entries for D. mojavensis.

    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/trackDb
    cvs up -d -P

    # Edit trackDb/makefile to add droMoj2 to the DBS variable.
    mkdir drosophila/droMoj2
    # Create a simple drosophila/droMoj2/description.html file.
    cvs add drosophila/droMoj2
    cvs add drosophila/droMoj2/description.html
    make update DBS=droMoj2 ZOO_DBS=

    # go public on genome-test
    cvs ci makefile
    cvs ci drosophila/droMoj2
    mkdir /gbdb/droMoj2/html
    # in a clean, updated tree's kent/src/hg/makeDb/trackDb:
    make alpha


# PUT SEQUENCE ON /ISCRATCH FOR BLASTZ (DONE 8/5/05 angie)
    # First, agglomerate small scaffolds into chunks of ~100k median 
    # (many scaffolds are larger than that) so we don't have too many 
    # files for one dir, but keep a reasonably low job run time:
    ssh kkstore02
    cd /cluster/data/droMoj2
    mkdir chunksUnsplit
    faSplit about scaffolds.fa 100000 chunksUnsplit/chunk_
    ssh kkr1u00
    mkdir /iscratch/i/droMoj2
    rsync -av /cluster/data/droMoj2/chunksUnsplit /iscratch/i/droMoj2/
    rsync -av /cluster/data/droMoj2/droMoj2.2bit /iscratch/i/droMoj2/
    iSync


# PRODUCING GENSCAN PREDICTIONS (DONE 8/5/05 angie)
    ssh kkstore02
    # Make hard-masked scaffolds and split up for processing:
    cd /cluster/data/droMoj2
    maskOutFa scaffolds.fa hard scaffolds.fa.masked
    mkdir chunksUnsplitMasked
    faSplit about scaffolds.fa.masked 100000 chunksUnsplitMasked/chunk_
    mkdir /cluster/data/droMoj2/bed/genscan
    cd /cluster/data/droMoj2/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    ls -1S ../../chunksUnsplitMasked/chunk*.fa > chunks.list
    cat << '_EOF_' > gsub
#LOOP
gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    ssh kki
    cd /cluster/data/droMoj2/bed/genscan
    gensub2 chunks.list single gsub jobList
    para make jobList
    para time
#Completed: 304 of 304 jobs
#Average job time:                  24s       0.40m     0.01h    0.00d
#Longest finished job:            1097s      18.28m     0.30h    0.01d
#Submission to last job:          1097s      18.28m     0.30h    0.01d

    # If there are crashes, diagnose with "para problems".  
    # If a job crashes due to genscan running out of memory, re-run it 
    # manually with "-window=1200000" instead of "-window=2400000".
    
    # Concatenate scaffold-level results:
    ssh kkstore02
    cd /cluster/data/droMoj2/bed/genscan
    cat gtf/*.gtf > genscan.gtf
    cat subopt/*.bed > genscanSubopt.bed
    cat pep/*.pep > genscan.pep
    # Clean up
    rm -r /cluster/data/droMoj2/chunksUnsplitMasked

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/droMoj2/bed/genscan
    ldHgGene -gtf droMoj2 genscan genscan.gtf
    hgPepPred droMoj2 generic genscanPep genscan.pep
    hgLoadBed droMoj2 genscanSubopt genscanSubopt.bed


# MAKE DOWNLOADABLE FILES (DONE 8/5/05 angie)
    ssh kkstore02
    mkdir /cluster/data/droMoj2/zips
    cd /cluster/data/droMoj2
    gzip -c RMOut/scaffolds.fa.out > zips/scaffoldOut.gz
    gzip -c scaffolds.fa > zips/scaffoldFa.gz
    gzip -c scaffolds.fa.masked > zips/scaffoldFaMasked.gz
    gzip -c bed/simpleRepeat/trfMask.bed > zips/scaffoldTrf.gz
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/droMoj2
    cd /usr/local/apache/htdocs/goldenPath/droMoj2
    mkdir bigZips database
    # Create README.txt files in bigZips/ and database/ to explain the files.
    cd bigZips
    ln -s /cluster/data/droMoj2/zips/*.gz .
    nice md5sum *.gz > md5sum.txt


# MAKE 11.OOC FILE FOR BLAT (DONE 8/5/05 angie)
    # Use -repMatch=100 (based on size -- for human we use 1024, and 
    # fly size is ~4.4% of human judging by gapless dm1 genome size from 
    # featureBits -- we would use 45, but bump that up a bit to be more 
    # conservative).
    ssh kkr1u00
    mkdir /cluster/bluearc/droMoj2
    blat /cluster/data/droMoj2/droMoj2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/droMoj2/11.ooc -repMatch=100
#Wrote 17296 overused 11-mers to /cluster/bluearc/droMoj2/11.ooc
    cp -p /cluster/bluearc/droMoj2/*.ooc /iscratch/i/droMoj2/
    iSync


# AUTO UPDATE GENBANK MRNA RUN (TODO 8/5/05 angie)
    ssh hgwdev

    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvsup .

    # Edit etc/genbank.conf and add these lines (note scaffold-browser settings):
# droMoj2 (D. mojavensis)
droMoj2.genome = /iscratch/i/droMoj2/droMoj2.2bit
droMoj2.mondoTwoBitParts = 1000
droMoj2.lift = no
droMoj2.refseq.mrna.native.load = no
droMoj2.refseq.mrna.xeno.load = yes
droMoj2.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.75 -nearTop=0.005
droMoj2.genbank.mrna.xeno.load = yes
# GenBank has no D. mojavensis ESTs at this point... that may change.
droMoj2.genbank.est.native.load = no
droMoj2.genbank.est.xeno.load = no
droMoj2.downloadDir = droMoj2
droMoj2.perChromTables = no

    cvs ci etc/genbank.conf
    # Update src/align/gbBlat to use /iscratch/i/droMoj2/11.ooc
    cvs diff src/align/gbBlat
    make
    cvs ci src/align/gbBlat

    # Install to /cluster/data/genbank:
    make install-server

    ssh `fileServer /cluster/data/genbank/`
    cd /cluster/data/genbank
    # This is an -initial run, (xeno) RefSeq only:
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial droMoj2 &
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad droMoj2
    featureBits droMoj2 xenoRefGene
#16520385 bases of 165766797 (9.966%) in intersection
    # Clean up:
    rm -rf work/initial.droMoj2

    # This is an -initial run, mRNA only:
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial droMoj2 &
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad droMoj2
    featureBits droMoj2 all_mrna
#19602 bases of 165766797 (0.012%) in intersection
    featureBits droMoj2 xenoMrna
#17295487 bases of 165766797 (10.434%) in intersection
    # Clean up:
    rm -rf work/initial.droMoj2


# SWAP CHAINS FROM DM2, BUILD NETS ETC. (IN PROGRESS 8/9/05 angie)
    mkdir /cluster/data/droMoj2/bed/blastz.dm2.swap
    cd /cluster/data/droMoj2/bed/blastz.dm2.swap
    doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.droMoj2/DEF \
      >& do.log
    echo "check /cluster/data/droMoj2/bed/blastz.dm2.swap/do.log" \
    | mail -s "check do.log" $USER
    # Add {chain,net}Dm2 to trackDb.ra if necessary.


# MAKE GCPERCENT (DONE 8/9/05 angie)
    ssh hgwdev
    mkdir /cluster/data/droMoj2/bed/gc5Base
    cd /cluster/data/droMoj2/bed/gc5Base
    hgGcPercent -wigOut -doGaps -file=stdout -win=5 -verbose=2 droMoj2 \
       /cluster/data/droMoj2 | wigEncode stdin gc5Base.wig gc5Base.wib
    mkdir /gbdb/droMoj2/wib
    ln -s `pwd`/gc5Base.wib /gbdb/droMoj2/wib
    hgLoadWiggle -pathPrefix=/gbdb/droMoj2/wib droMoj2 gc5Base gc5Base.wig


# MAKE THIS THE DEFAULT ASSEMBLY WHEN THERE ARE ENOUGH TRACKS (TODO 8/?/05 angie)
    hgsql -h genome-testdb hgcentraltest -e \
      'UPDATE defaultDb set name = "droMoj2" where genome = "D. mojavensis";'

# MAKE Drosophila Proteins track (DONE braney 2005-08-20)
    ssh kkstore02
    mkdir -p /cluster/data/droMoj2/blastDb
    cd /cluster/data/droMoj2/blastDb
    faSplit sequence ../scaffolds.fa 400 x
    for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
    rm *.fa *.log

    ssh kkr1u00
    mkdir -p /iscratch/i/droMoj2/blastDb
    cp /cluster/data/droMoj2/blastDb/* /iscratch/i/droMoj2/blastDb
    iSync > sync.out
    
    ssh kk
    mkdir -p /cluster/data/droMoj2/bed/tblastn.dm2FB
    cd /cluster/data/droMoj2/bed/tblastn.dm2FB
    ls -1S /iscratch/i/droMoj2/blastDb/*.nsq | sed "s/\.nsq//" > target.lst
    mkdir fbfa
    # calculate a reasonable number of jobs 
    calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl|awk "{print \\\$1}"`/\(80000/`wc target.lst | awk "{print \\\$1}"`\)
# 18929/(80000/396) = 93.698550

    split -l 94 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb
    cd fbfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S fbfa/*.fa > fb.lst
    mkdir -p /cluster/bluearc/droMoj2/bed/tblastn.dm2FB/blastOut  
    ln -s /cluster/bluearc/droMoj2/bed/tblastn.dm2FB/blastOut  
    for i in `cat fb.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.2
        mv $3.tmp $3
        rm -f $f.1 $f.2 $f.3 $f.4
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.3 $f.8 $f.4
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 target.lst fb.lst blastGsub blastSpec

    para create blastSpec
    para push

# Completed: 79991 of 79992 jobs
# para.results: file not found.  paraHub can't write to this dir?
# CPU time in finished jobs:    5722807s   95380.12m  1589.67h   66.24d  0.181 y
# IO & Wait Time:                517066s    8617.76m   143.63h    5.98d  0.016 y
# Average job time:                  78s       1.30m     0.02h    0.00d
# Longest finished job:           44845s     747.42m    12.46h    0.52d
# Submission to last job:         78192s    1303.20m    21.72h    0.91d

    ssh pk
    cd /cluster/data/droMoj2/bed/tblastn.dm2FB
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/fb?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    para create chainSpec
    para maxNode 20
    para push
 
# Completed: 202 of 202 jobs
# CPU time in finished jobs:      27335s     455.59m     7.59h    0.32d  0.001 y
# IO & Wait Time:                  3390s      56.50m     0.94h    0.04d  0.000 y
# Average job time:                 152s       2.54m     0.04h    0.00d
# Longest finished job:            4228s      70.47m     1.17h    0.05d
# Submission to last job:          4228s      70.47m     1.17h    0.05d

    ssh kkstore02
    cd /cluster/data/droMoj2/bed/tblastn.dm2FB/blastOut
    for i in fb??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/droMoj2/bed/tblastn.dm2FB/blastDm2FB.psl
    cd ..
    wc blastDm2FB.psl
# 19158  402318 5264815 blastDm2FB.psl
    pslUniq blastDm2FB.psl stdout | wc                                                                                    
#  17958  377118 5085091
    cat fbfa/*fa | grep ">" | wc
# 82338   82338 1300520

    ssh hgwdev
    cd /cluster/data/droMoj2/bed/tblastn.dm2FB
    hgLoadPsl droMoj2 blastDm2FB.psl
    featureBits droMoj2 blastDm2FB
# 18239436 bases of 180520992 (10.104%) in intersection

    exit

    # back to kkstore02
    rm -rf blastOut

# End tblastn

# AUTO UPDATE GENBANK MRNA AND EST GENES RUN (DONE, 2005-08-22, markd)
    # align with revised genbank process
    cd ~kent/src/makeDb/genbank
    cvs update -d etc
    # edit etc/genbank.conf to add droMoj2, had to run on pk, due to kk
    # being down.  Set temporary locations for server files
# droMoj2 (D. mojavensis)
# genbank has 1 mRNA and 0 ESTs at time of initial build
droMoj2.serverGenome = /cluster/data/droMoj2/droMoj2.2bit
##droMoj2.clusterGenome = /iscratch/i/droMoj2/droMoj2.2bit
##droMoj2.ooc = /iscratch/i/droMoj2/11.ooc
droMoj2.clusterGenome = /san/sanvol1/scratch/droMoj2/droMoj2.2bit
droMoj2.ooc = /san/sanvol1/scratch/droMoj2/11.ooc
droMoj2.lift = no
droMoj2.refseq.mrna.native.load = no
droMoj2.refseq.mrna.xeno.load = yes
droMoj2.refseq.mrna.xeno.pslCDnaFilter = -minCover=0.15 -coverNearTop=0.005 -minId=0.75 -idNearTop=0.005 -maxRepMatch=0.4 -bestOverlap
droMoj2.genbank.mrna.xeno.load = yes
droMoj2.genbank.est.native.load = no
droMoj2.genbank.est.xeno.load = no
droMoj2.downloadDir = droMoj2
droMoj2.perChromTables = no

    # update /cluster/data/genbank/
    make etc-update

    ssh kkstore02
    cd /cluster/data/genbank
    nice bin/gbAlignStep -initial droMoj2 &

    # when finished
    ssh hgwdev
    cd /cluster/data/genbank
    nice ./bin/gbDbLoadStep -drop -initialLoad  droMoj2&

# AUTO UPDATE GENBANK MRNA AND EST GENES RUN (DONE, 2005-08-31, markd)
    # redo genbank revised alignment procedure once again to
    # pickup local near best pslCDnaFilter

    # align with revised genbank process
    cd ~kent/src/makeDb/genbank
    cvs update -d etc
    # edit etc/genbank.conf to add droMoj2
# droMoj2 (D. mojavensis)
# genbank has 1 mRNA and 0 ESTs at time of initial build
droMoj2.serverGenome = /cluster/data/droMoj2/droMoj2.2bit
droMoj2.clusterGenome = /iscratch/i/droMoj2/droMoj2.2bit
droMoj2.ooc = /iscratch/i/droMoj2/11.ooc
droMoj2.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
droMoj2.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
droMoj2.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
droMoj2.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
droMoj2.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
droMoj2.genbank.est.xeno.pslCDnaFilter    = ${lowCover.genbank.est.xeno.pslCDnaFilter}
droMoj2.lift = no
droMoj2.refseq.mrna.native.load = no
droMoj2.refseq.mrna.xeno.load = yes
droMoj2.genbank.mrna.xeno.load = yes
droMoj2.genbank.est.native.load = no
droMoj2.genbank.est.xeno.load = no
droMoj2.downloadDir = droMoj2
droMoj2.perChromTables = no

    # update /cluster/data/genbank/
    make etc-update

    ssh kkstore02
    cd /cluster/data/genbank
    nice bin/gbAlignStep -initial droMoj2 &

    # when finished
    ssh hgwdev
    cd /cluster/data/genbank
    nice ./bin/gbDbLoadStep -drop -initialLoad  droMoj2&


# GENEMAPPER PREDICTIONS FROM UCB (DONE 1/24/06 angie)
    ssh hgwdev
    mkdir /cluster/data/droMoj2/bed/geneMapper
    cd /cluster/data/droMoj2/bed/geneMapper
    wget http://bio.math.berkeley.edu/genemapper/GFF/rel0.2/DroMoj_20050801.gff
    # Don't use -genePredExt... there are no start/stop_codon items, so 
    # all get marked "incmpl", and name2 always gets the same value as name.
    # Get rid of custom track header lines:
    egrep -v '^(track|browser) ' DroMoj_20050801.gff \
    | ldHgGene -gtf droMoj2 geneMapper stdin


###########################################################################
# LIFTOVER TO DROMOJ3 (DONE 3/16/09 angie)
    doSameSpeciesLiftOver.pl droMoj2 droMoj3 -debug \
      -ooc=/hive/data/staging/data/droMoj2/11.ooc -workhorse=kolossus
# *** Steps were performed in /hive/data/genomes/droMoj2/bed/blat.droMoj3.2009-03-16
    cd /hive/data/genomes/droMoj2/bed/blat.droMoj3.2009-03-16/
    doSameSpeciesLiftOver.pl droMoj2 droMoj3 \
      -ooc=/hive/data/staging/data/droMoj2/11.ooc -workhorse=kolossus


###########################################################################
