# for emacs: -*- mode: sh; -*-


# Drosophila virilis -- 
# 
# Agencourt's 12 July 2004 assembly
# 
#


# DOWNLOAD SEQUENCE (DONE 11/5/04 angie)
    # Create the assembly directory on store8, symlink it from /cluster/data,
    # then fetch and unpack the Agencourt tarball under downloads/.
    ssh kksilo
    mkdir /cluster/store8/droVir1
    cd /cluster/data
    ln -s /cluster/store8/droVir1 droVir1
    cd /cluster/data/droVir1
    mkdir jkStuff bed
    mkdir downloads
    cd downloads
    wget http://rana.lbl.gov/drosophila/assemblies/dvir_agencourt_arachne_12july04.tar.gz
    tar xvzf dvir_agencourt_arachne_12july04.tar.gz
    # Sanity-check both FASTA files; the recorded faSize output follows each
    # command.  Both report the same 189176135 real (non-N) bases.
    faSize contigs.fa
#189176135 bases (0 N's 189176135 real) in 34715 sequences in 1 files
#Total size: mean 5449.4 sd 16095.1 min 90 (contig_34714) max 324616 (contig_3061) median 1204
#N count: mean 0.0 sd 0.0
    faSize scaffolds.fa
#196642585 bases (7466450 N's 189176135 real) in 27948 sequences in 1 files
#Total size: mean 7036.0 sd 145953.7 min 90 (scaffold_27947) max 8062012 (scaffold_0) median 1089
#N count: mean 267.2 sd 6676.2
    # Only scaffolds.fa is used by the later steps in this doc, so the
    # contig-level file is removed.
    rm contigs.fa


# PARTITION SCAFFOLDS FOR REPEATMASKER RUN (DONE 11/5/04 angie)
    # Split large scaffolds into 500000-base pieces.
    # Agglomerate the small scaffolds up into ~500k collections.
    # NOTE(review): the "faSplit about" target below is 400000, not 500000,
    # so the collections are presumably closer to ~400k -- confirm.
    ssh kksilo
    cd /cluster/data/droVir1
    # The -lift file written here is what liftUp uses after the RepeatMasker
    # run to map the split pieces back to scaffold coordinates.
    faSplit size downloads/scaffolds.fa 500000 -oneFile \
      scaffoldsSplit -lift=jkStuff/scaffoldsSplit.lft
    mkdir chunks
    faSplit about scaffoldsSplit.fa 400000 chunks/chunk_


# RUN REPEAT MASKER (DONE 11/7/04 angie)
    # January ("March") '04 version of RepeatMasker and libs.
    # make the run directory, output directory, and job list
    ssh kksilo
    cd /cluster/data/droVir1
    # Per-job wrapper script.  Usage: RMDrosophila <outDir> <chunkFile>
    # It cd's to <outDir>, copies ../chunks/<chunkFile> to a scratch
    # directory under /tmp/droVir1, runs RepeatMasker there, copies the
    # resulting <chunkFile>.out back into <outDir>, and cleans up /tmp.
    # The quoted heredoc delimiter keeps $1/$2 literal in the written script.
    cat << '_EOF_' > jkStuff/RMDrosophila
#!/bin/csh -fe

cd $1
/bin/mkdir -p /tmp/droVir1/$2
/bin/cp ../chunks/$2 /tmp/droVir1/$2/
pushd /tmp/droVir1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -spec drosophila $2
popd
/bin/cp /tmp/droVir1/$2/$2.out ./
/bin/rm -fr /tmp/droVir1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/droVir1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/droVir1
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMDrosophila
    mkdir RMRun RMOut
    # Start from an empty job list, then append one job line per chunk:
    # wrapper, output dir, chunk file name ($f:t = name without directory),
    # plus check-in/check-out clauses naming the input chunk and the
    # expected .out file.
    cp /dev/null RMRun/RMJobs
    foreach f ( chunks/*.fa )
      set chunk = $f:t
      echo ../jkStuff/RMDrosophila \
           /cluster/data/droVir1/RMOut $chunk \
           '{'check in line+ /cluster/data/droVir1/$f'}' \
         '{'check out line+ /cluster/data/droVir1/RMOut/$chunk.out'}' \
      >> RMRun/RMJobs
    end

    # do the run
    # ("para try, check, push, check,..." below is shorthand for the
    # interactive sequence of para commands, not a literal command line.)
    ssh kk9
    cd /cluster/data/droVir1/RMRun
    para create RMJobs
    para try, check, push, check,...
    # Recorded summary once the batch had finished:
#Completed: 400 of 400 jobs
#Average job time:                2895s      48.25m     0.80h    0.03d
#Longest job:                     6550s     109.17m     1.82h    0.08d
#Submission to last job:         17280s     288.00m     4.80h    0.20d

    # Lift up the split-scaffold .out's to scaffold .out's
    # (RMOut/chunk_NN.fa.out -> RMOut/chunk_NN.scaf.out; $f:r:r strips the
    # ".out" and ".fa" extensions), using the lift file from faSplit.
    ssh kksilo
    cd /cluster/data/droVir1
    foreach f (RMOut/*.fa.out)
      liftUp $f:r:r.scaf.out jkStuff/scaffoldsSplit.lft warn $f > /dev/null
    end
    # Make a consolidated scaffold .out file too:
    # keep the 3-line header once (taken from chunk_00), then append every
    # lifted file minus its own first 3 lines.  "head -n 3" / "tail -n +4"
    # are the POSIX spellings; the obsolete "tail +4" form is treated as a
    # file name by newer tail implementations.
    head -n 3 RMOut/chunk_00.fa.out > RMOut/scaffolds.fa.out
    foreach f (RMOut/*.scaf.out)
      tail -n +4 $f >> RMOut/scaffolds.fa.out
    end
    # Load the .out files into the database with:
    ssh hgwdev
    hgLoadOut droVir1 /cluster/data/droVir1/RMOut/scaffolds.fa.out
    # hgLoadOut made a "scaffolds_rmsk" table even with -table=rmsk,
    # but we want a single (non-split) table with no prefix:
    hgsql droVir1 -e 'rename table scaffolds_rmsk to rmsk'
    # Fix up the indices too: rebuild each one keyed on a 14-character
    # prefix of genoName plus the original column.
    # (14 is presumably the longest scaffold name length, e.g.
    # "scaffold_27947" -- confirm.)
    hgsql droVir1 -e 'drop index bin       on rmsk; \
                  drop index genoStart on rmsk; \
                  drop index genoEnd   on rmsk; \
                  create index bin       on rmsk (genoName(14), bin); \
                  create index genoStart on rmsk (genoName(14), genoStart); \
                  create index genoEnd   on rmsk (genoName(14), genoEnd);'


# CREATING DATABASE (DONE 11/5/04 angie)
    # NOTE: dated 11/5/04, i.e. done before the RepeatMasker load above
    # (11/7/04), which needs the droVir1 database to exist.
    # Create the database.
    ssh hgwdev
    # Make sure there is at least 5 gig free for the database
    df -h /var/lib/mysql
#/dev/sdc1             1.8T  640G 1021G  39% /var/lib/mysql
    hgsql '' -e 'create database droVir1'
    # Copy all the data from the table "grp"
    # in an existing database (hg17) to the new database,
    # declaring NAME as the primary key of the new table.
    ssh hgwdev
    hgsql droVir1 -e 'create table grp (PRIMARY KEY(NAME)) select * from hg17.grp'


# EXTRACTING GAP INFO FROM BLOCKS OF NS (DONE 11/5/04 angie)
    # No AGP file is used here; gap records are derived from the runs of
    # N's in scaffolds.fa.
    ssh kksilo
    mkdir /cluster/data/droVir1/bed/fakeAgp
    cd /cluster/data/droVir1/bed/fakeAgp
    # Survey the N-block sizes first, to pick a sensible minimum gap size:
    faGapSizes ../../downloads/scaffolds.fa \
      -niceSizes=5,10,20,25,30,40,50,100,250,500,1000,10000,100000
    # Wow, quite a few blocks of N's are exactly 25bp long.  That is 
    # probably the min gap size for the assembly, and it also happens to 
    # be the default value of -minContigGap for hgFakeAgp, so run with it:
    hgFakeAgp ../../downloads/scaffolds.fa fake.agp
    # Load the generated AGP's gaps into the database:
    ssh hgwdev
    hgLoadGap -unsplit droVir1 /cluster/data/droVir1/bed/fakeAgp/fake.agp


# SIMPLE REPEATS (TRF) (DONE 11/5/04 angie)
    ssh kksilo
    mkdir /cluster/data/droVir1/bed/simpleRepeat
    cd /cluster/data/droVir1/bed/simpleRepeat
    # Run trfBig in the background over all scaffolds.  The main output
    # goes to /dev/null; the results wanted are in -bedAt=simpleRepeat.bed.
    # "|&" (csh) pipes stdout and stderr together; egrep -v drops the
    # routine progress lines so trf.log holds only unexpected messages.
    nice trfBig -trf=/cluster/bin/i386/trf \
      ../../downloads/scaffolds.fa \
      /dev/null -bedAt=simpleRepeat.bed -tempDir=/tmp \
    |& egrep -v '^(Removed|Tandem|Copyright|Loading|Allocating|Initializing|Computing|Scanning|Freeing)' \
    > trf.log &
    # check on this with
    tail -f trf.log

    # Load this into the database as so
    # (after the background trfBig job above has finished):
    ssh hgwdev
    hgLoadBed droVir1 simpleRepeat \
      /cluster/data/droVir1/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql


# FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 11/5/04 angie)
    # make a filtered version of the trf output: 
    # keep trf's with period <= 12 (the period is field 5 of the bed file).
    # trfMask.bed is the file used for soft-masking in the next section.
    ssh kksilo
    cd /cluster/data/droVir1/bed/simpleRepeat
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed


# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 11/7/04 angie)
    ssh kksilo
    cd /cluster/data/droVir1
    # Pass 1: soft-mask the downloaded scaffolds with the filtered TRF bed,
    # writing a new top-level scaffolds.fa.
    maskOutFa -soft downloads/scaffolds.fa \
      bed/simpleRepeat/trfMask.bed scaffolds.fa
    # Pass 2: add the RepeatMasker masking on top (-softAdd), rewriting
    # scaffolds.fa in place.
    maskOutFa -softAdd scaffolds.fa RMOut/scaffolds.fa.out scaffolds.fa
    # Now clean up the unmasked chunks to avoid confusion later.
    # (chunks/ is re-created from the masked scaffolds.fa in
    # "PUT SEQUENCE ON /ISCRATCH FOR BLASTZ" below.)
    rm -r chunks scaffoldsSplit.fa jkStuff/scaffoldsSplit.lft


# STORE SEQUENCE AND ASSEMBLY INFORMATION (DONE 11/7/04 angie)
    # Translate to 2bit
    ssh kksilo
    cd /cluster/data/droVir1
    faToTwoBit scaffolds.fa droVir1.2bit
    # Make chromInfo.tab: name, size, and the /gbdb path of the 2bit for
    # every sequence reported by twoBitInfo.
    mkdir bed/chromInfo
    twoBitInfo droVir1.2bit stdout \
    | awk '{printf "%s\t%s\t/gbdb/droVir1/droVir1.2bit\n", $1, $2;}' \
    > bed/chromInfo/chromInfo.tab

    # Make a symbolic link from /gbdb/droVir1 to the 2bit.
    ssh hgwdev
    mkdir -p /gbdb/droVir1
    ln -s /cluster/data/droVir1/droVir1.2bit /gbdb/droVir1/
    # Load chromInfo table.
    hgsql droVir1 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql droVir1 -e 'load data local infile \
      "/cluster/data/droVir1/bed/chromInfo/chromInfo.tab" into table chromInfo'
    # Make chrom.sizes from chromInfo contents and check scaffold count.
    # (Expected: 27948, the sequence count faSize reported for scaffolds.fa.)
    hgsql droVir1 -N -e 'select chrom,size from chromInfo' \
    > /cluster/data/droVir1/chrom.sizes
    wc -l /cluster/data/droVir1/chrom.sizes
#  27948 /cluster/data/droVir1/chrom.sizes


# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE (DONE 11/5/04 angie)
    # Warning: genome and organism fields must correspond
    # with defaultDb values
    # The two INSERTs below register the assembly in hgcentraltest on
    # genome-testdb: a dbDb row describing droVir1, and a defaultDb row
    # making droVir1 the default database for genome "D. virilis".
    echo 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
             defaultPos, active, orderKey, genome, scientificName, \
             htmlPath, hgNearOk, hgPbOk, sourceName) values \
        ("droVir1", "Jul. 2004", "/gbdb/droVir1", "D. virilis", \
             "scaffold_6:5128001-5158000", 1, 57, \
             "D. virilis", \
             "Drosophila virilis", "/gbdb/droVir1/html/description.html", \
             0, 0, "Agencourt 12 July 2004");' \
      | hgsql -h genome-testdb hgcentraltest
    echo 'INSERT INTO defaultDb (genome, name) values ("D. virilis", "droVir1");' \
      | hgsql -h genome-testdb hgcentraltest

    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/trackDb
    cvsup .

    # Edit trackDb/makefile to add droVir1 to the DBS variable.
    mkdir drosophila/droVir1
    # Create a simple drosophila/droVir1/description.html file.
    # (Manual step: the file must exist before the "cvs add" of it below.)
    cvs add drosophila/droVir1
    cvs add drosophila/droVir1/description.html
    make update DBS=droVir1 ZOO_DBS=

    # go public on genome-test
    cvs ci makefile
    cvs ci drosophila/droVir1
    # /gbdb/droVir1/html is the directory named in dbDb.htmlPath above.
    mkdir /gbdb/droVir1/html
    # in a clean, updated tree's kent/src/hg/makeDb/trackDb:
    make alpha


# PUT SEQUENCE ON /ISCRATCH FOR BLASTZ (DONE 11/5/04 angie)
    # First, agglomerate small scaffolds into chunks of ~1M median 
    # so we don't have too many files for one dir, but keep a reasonably 
    # low job run time:
    ssh kksilo
    cd /cluster/data/droVir1
    # chunks/ here is built from the top-level (masked) scaffolds.fa; it is
    # a different set from the RepeatMasker chunks/ that were removed in the
    # masking section.
    mkdir chunks
    faSplit about scaffolds.fa 1000000 chunks/chunk_
    # Copy the chunks and the 2bit to /iscratch/i and run iSync.
    ssh kkr1u00
    mkdir /iscratch/i/droVir1
    cp -pR /cluster/data/droVir1/chunks /iscratch/i/droVir1/
    cp -p /cluster/data/droVir1/droVir1.2bit /iscratch/i/droVir1/
    iSync


# PRODUCING GENSCAN PREDICTIONS (DONE 11/7/04 angie)
    ssh kksilo
    # Make hard-masked scaffolds and split up for processing:
    cd /cluster/data/droVir1
    maskOutFa scaffolds.fa hard scaffolds.fa.masked
    mkdir chunksHardMasked
    faSplit about scaffolds.fa.masked 1000000 chunksHardMasked/chunk_
    mkdir /cluster/data/droVir1/bed/genscan
    cd /cluster/data/droVir1/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # List the chunk files largest-first (ls -S) as input for gensub2.
    ls -1S ../../chunksHardMasked/chunk*.fa > chunks.list
    # Job template: one gsBig job per chunk, writing gtf/, pep/ and subopt/
    # outputs named after the chunk's root name.
    cat << '_EOF_' > gsub
#LOOP
gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 chunks.list single gsub jobList
    ssh kki
    cd /cluster/data/droVir1/bed/genscan
    para create jobList
    para try, check, push, check, ...
    # Recorded summary once the batch had finished:
#Completed: 101 of 101 jobs
#Average job time:                  63s       1.06m     0.02h    0.00d
#Longest job:                      261s       4.35m     0.07h    0.00d
#Submission to last job:           515s       8.58m     0.14h    0.01d

    # If there are crashes, diagnose with "para problems".  
    # If a job crashes due to genscan running out of memory, re-run it 
    # manually with "-window=1200000" instead of "-window=2400000".
    
    # Concatenate scaffold-level results:
    ssh kksilo
    cd /cluster/data/droVir1/bed/genscan
    cat gtf/*.gtf > genscan.gtf
    cat subopt/*.bed > genscanSubopt.bed
    cat pep/*.pep > genscan.pep
    # Clean up:
    # (only the split hard-masked chunks; scaffolds.fa.masked itself is kept
    # and is zipped in MAKE DOWNLOADABLE FILES.)
    rm -r /cluster/data/droVir1/chunksHardMasked

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/droVir1/bed/genscan
    # Reloaded without -genePredExt 1/6/05:
    ldHgGene -gtf droVir1 genscan genscan.gtf
    hgPepPred droVir1 generic genscanPep genscan.pep
    hgLoadBed droVir1 genscanSubopt genscanSubopt.bed

# MYTOUCH FIX - jen - 2006-01-24
  # NOTE(review): presumably sets the recorded update time of the
  # droVir1.genscanPep table to the given timestamp (0501071300.00) --
  # confirm against mytouch usage.
  sudo mytouch droVir1 genscanPep 0501071300.00

# MAKE DOWNLOADABLE FILES (DONE 11/7/04 angie)
# FASTA FIXED 3/21/05 angie, see below
    ssh kksilo
    mkdir /cluster/data/droVir1/zips
    cd /cluster/data/droVir1
    # zip -j stores each file without its directory path.
    zip -j zips/scaffoldOut.zip RMOut/scaffolds.fa.out
    zip -j zips/scaffoldFa.zip scaffolds.fa
    zip -j zips/scaffoldFaMasked.zip scaffolds.fa.masked
    zip -j zips/scaffoldTrf.zip bed/simpleRepeat/trfMask.bed
    # Verify each archive; the last line of "unzip -t" is its summary.
    foreach f (zips/*.zip)
      echo $f
      unzip -t $f | tail -1
    end
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/droVir1
    cd /usr/local/apache/htdocs/goldenPath/droVir1
    mkdir bigZips database
    # Create README.txt files in bigZips/ and database/ to explain the files.
    cd bigZips
    cp -p /cluster/data/droVir1/zips/*.zip .
    md5sum *.zip > md5sum.txt


# SWAP DM1-DROVIR1 BLASTZ (DONE 11/7/04 angie)
    ssh kksilo
    mkdir -p /cluster/data/droVir1/bed/blastz.dm1.swap.2004-11-07/axtScaffold
    ln -s blastz.dm1.swap.2004-11-07 /cluster/data/droVir1/bed/blastz.dm1
    cd /cluster/data/droVir1/bed/blastz.dm1
    # The alignments were computed with dm1 as target (aliDir).  Swapping
    # target and query means the two .len files trade places here.
    set aliDir = /cluster/data/dm1/bed/blastz.droVir1
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    # With 10k scaffolds, we don't want a directory with one file per 
    # scaffold.  So just make one .axt with everything -- not too huge 
    # anyway, given these little insect genomes.
    # NOTE(review): chrom.sizes above records 27948 scaffolds for droVir1,
    # so "10k" looks carried over from another assembly's doc -- confirm.
    zcat $aliDir/axtChrom/chr*.axt.gz \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSort stdin dm1.axt


# CHAIN MELANOGASTER BLASTZ (DONE 11/8/04 angie)
    # Run axtChain on kolossus (one big dm1.axt input)
    ssh kolossus
    mkdir /cluster/data/droVir1/bed/blastz.dm1/axtChain
    cd /cluster/data/droVir1/bed/blastz.dm1/axtChain
    # Single pipeline: axtChain -> chainAntiRepeat -> chainMergeSort, with
    # the droVir1 2bit and the dm1 nib directory as the two sequence
    # sources, producing one all.chain file.
    axtChain -verbose=0 ../dm1.axt /cluster/data/droVir1/droVir1.2bit \
      /cluster/data/dm1/nib stdout \
    | chainAntiRepeat /cluster/data/droVir1/droVir1.2bit \
      /cluster/data/dm1/nib stdin stdout \
    | chainMergeSort stdin > all.chain
    # Load chains into database
    ssh hgwdev
    cd /cluster/data/droVir1/bed/blastz.dm1/axtChain
    hgLoadChain -tIndex droVir1 chainDm1 all.chain


# NET MELANOGASTER BLASTZ (DONE 11/8/04 angie)
    ssh kksilo
    cd /cluster/data/droVir1/bed/blastz.dm1/axtChain
    # S1.len/S2.len are the swapped size files made in the SWAP step.
    # chainNet's second output is discarded (/dev/null); only the first
    # net is kept and passed through netSyntenic.
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/droVir1/bed/blastz.dm1/axtChain
    netClass -noAr noClass.net droVir1 dm1 melanogaster.net

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/droVir1/bed/blastz.dm1/axtChain
    # noClass.net is no longer needed once melanogaster.net exists.
    rm noClass.net
    netFilter -syn melanogaster.net > melanogasterSyn.net

    # Load the nets into database 
    # (both the full and the syntenic net are filtered with -minGap=10
    # on the way in; the .net files on disk are left unfiltered).
    ssh hgwdev
    cd /cluster/data/droVir1/bed/blastz.dm1/axtChain
    netFilter -minGap=10 melanogaster.net |  hgLoadNet droVir1 netDm1 stdin
    netFilter -minGap=10 melanogasterSyn.net \
    | hgLoadNet droVir1 netSyntenyDm1 stdin


# MAKE AXTNET (DONE 11/8/04 angie)
    # Convert the classified net plus all.chain back into alignments,
    # sorted into a single melanogasterNet.axt.
    ssh kksilo
    cd /cluster/data/droVir1/bed/blastz.dm1/axtChain
    netToAxt melanogaster.net all.chain /cluster/data/droVir1/droVir1.2bit \
        /cluster/data/dm1/nib stdout \
      | axtSort stdin melanogasterNet.axt


# MAKE VSDM1 DOWNLOADABLES (DONE 11/8/04 angie)
    ssh kksilo
    cd /cluster/data/droVir1/bed/blastz.dm1/axtChain
    # Compresses every .chain, .net and .axt in axtChain/ in place.
    nice gzip *.{chain,net,axt}
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/droVir1/vsDm1
    cd /usr/local/apache/htdocs/goldenPath/droVir1/vsDm1
    # all.chain.gz is published under the name melanogaster.chain.gz.
    cp -p /cluster/data/droVir1/bed/blastz.dm1/axtChain/all.chain.gz \
      melanogaster.chain.gz
    cp -p /cluster/data/droVir1/bed/blastz.dm1/axtChain/melanogaster.net.gz .
    cp -p /cluster/data/droVir1/bed/blastz.dm1/axtChain/melanogasterNet.axt.gz .
    # Make a README.txt which explains the files & formats.
    # NOTE(review): no subdirectories are created in vsDm1 above, so the
    # */*.gz pattern presumably matches nothing here -- confirm.
    md5sum *.gz */*.gz > md5sum.txt


# MAKE 11.OOC FILE FOR BLAT (DONE 11/8/04 angie)
    # Use -repMatch=100 (based on size -- for human we use 1024, and 
    # fly size is ~4.4% of human judging by gapless dm1 genome size from 
    # featureBits -- we would use 45, but bump that up a bit to be more 
    # conservative).
    ssh kkr1u00
    mkdir /cluster/bluearc/droVir1
    # The two /dev/null arguments fill blat's query and output positions;
    # this run is only for the -makeOoc side effect.
    blat /cluster/data/droVir1/droVir1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/droVir1/11.ooc -repMatch=100
#Wrote 14387 overused 11-mers to /cluster/bluearc/droVir1/11.ooc
    # Copy to /iscratch/i as well; the genbank section below points gbBlat
    # at /iscratch/i/droVir1/11.ooc.
    cp -p /cluster/bluearc/droVir1/*.ooc /iscratch/i/droVir1/
    iSync


# AUTO UPDATE GENBANK MRNA RUN (DONE 11/22/04 angie)
    ssh hgwdev
    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvsup .

    # Edit etc/genbank.conf and add these lines (note scaffold-browser settings):
    # (The unindented lines below are the text to add to genbank.conf,
    # not commands to run.)
# droVir1 (D. virilis)
droVir1.genome = /iscratch/i/droVir1/droVir1.2bit
droVir1.mondoTwoBitParts = 1000
droVir1.lift = no
droVir1.refseq.mrna.native.load = no
droVir1.refseq.mrna.xeno.load = yes
droVir1.refseq.mrna.xeno.pslReps = -minCover=0.15 -minAli=0.60 -nearTop=0.005
droVir1.genbank.mrna.xeno.load = yes
# GenBank has no D. virilis ESTs at this point... that may change.
droVir1.genbank.est.native.load = no
droVir1.genbank.est.xeno.load = no
droVir1.downloadDir = droVir1
droVir1.perChromTables = no

    cvs ci etc/genbank.conf
    # Since D. virilis is a new species for us, edit src/lib/gbGenome.c.  
    # Pick some other browser species, & monkey-see monkey-do.  
    cvs diff src/lib/gbGenome.c
    make
    cvs ci src/lib/gbGenome.c
    # Edit src/align/gbBlat to add /iscratch/i/droVir1/11.ooc
    cvs diff src/align/gbBlat
    make
    cvs ci src/align/gbBlat

    # Install to /cluster/data/genbank:
    make install-server

    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, (xeno) RefSeq only:
    nice bin/gbAlignStep -srcDb=refseq -type=mrna -initial droVir1 &
    # "[its logfile]" is a placeholder for the log file of the run above.
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad droVir1
    featureBits droVir1 xenoRefGene
#15497250 bases of 189178031 (8.192%) in intersection
    # Clean up:
    rm -rf work/initial.droVir1

    # This is an -initial run, mRNA only:
    # NOTE(review): no "ssh eieio" is recorded before this second alignment
    # step; presumably it was run on eieio like the first -- confirm.
    nice bin/gbAlignStep -srcDb=genbank -type=mrna -initial droVir1 &
    tail -f [its logfile]
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad droVir1
    featureBits droVir1 all_mrna
#51271 bases of 189178031 (0.027%) in intersection
    featureBits droVir1 xenoMrna
#15562690 bases of 189178031 (8.226%) in intersection
    # Clean up:
    rm -rf work/initial.droVir1


# MAKE GCPERCENT (DONE 11/5/04 angie)
    ssh hgwdev
    mkdir /cluster/data/droVir1/bed/gcPercent
    cd /cluster/data/droVir1/bed/gcPercent
    # create and load gcPercent table
    # (second argument is the assembly's top-level data directory)
    hgGcPercent droVir1 /cluster/data/droVir1


# MAKE HGCENTRALTEST BLATSERVERS ENTRY (DONE 12/?/04 b0b/jen)
    ssh hgwdev
    # Two rows for droVir1, both on host blat14: port 17784 with flags
    # (1, 0) and port 17785 with flags (0, 1).
    # NOTE(review): the last two columns are presumably isTrans and canPcr
    # -- confirm against the blatServers table definition.
    echo 'insert into blatServers values("droVir1", "blat14", "17784", 1, 0); \
          insert into blatServers values("droVir1", "blat14", "17785", 0, 1);' \
      | hgsql -h genome-testdb hgcentraltest


# MAKE Drosophila Proteins track  (DONE braney 11/17/04)
    # Build BLAST databases from the masked scaffolds: split scaffolds.fa
    # into up to 400 FASTA files (prefix "x"), run formatdb on each
    # (-p F, i.e. not protein), then drop the FASTA and formatdb logs.
    ssh kksilo
    mkdir -p /cluster/data/droVir1/blastDb
    cd /cluster/data/droVir1/blastDb
    faSplit sequence ../scaffolds.fa 400 x
    for i in *.fa; do formatdb -i "$i" -p F 2> /dev/null; done
    rm *.fa *.log

    # Copy the databases to /iscratch/i and sync.
    ssh kkr1u00
    mkdir -p /iscratch/i/droVir1/blastDb
    cp /cluster/data/droVir1/blastDb/* /iscratch/i/droVir1/blastDb
    # Redirect stdout to the log FIRST, then point stderr at it.  The
    # previous order ("2>&1 > sync.out") left stderr on the terminal, so
    # iSync errors never reached sync.out.
    (iSync) > sync.out 2>&1
    
    # bug.lst: one blast database base name per line (".nsq" stripped),
    # largest first; it is the first input list for gensub2 below.
    mkdir -p /cluster/data/droVir1/bed/tblastn.dm1FB
    cd /cluster/data/droVir1/bed/tblastn.dm1FB
    ls -1S /iscratch/i/droVir1/blastDb/*.nsq | sed "s/\.nsq//" > bug.lst
    exit

    # back to kksilo
    cd /cluster/data/droVir1/bed/tblastn.dm1FB
    mkdir fbfa
    # calculate a reasonable number of jobs
    # i.e. psl lines per query file = (lines in dm1FB.psl) /
    # (150000 / number of blast databases in bug.lst), so that
    # databases x query files comes out near 150000 jobs.
    calc `wc /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl | awk "{print \\\$1}"`/\(150000/`wc bug.lst | awk "{print \\\$1}"`\)
    # 18735/(150000/396) = 49.460400
    split -l 49 /cluster/data/dm1/bed/blat.dm1FB/dm1FB.psl fbfa/fb
    cd fbfa
    # Convert each split psl piece to FASTA and remove the psl piece.
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S fbfa/*.fa > fb.lst
    # One output directory per query file, named after its root (fbaa, ...).
    mkdir blastOut
    for i in `cat fb.lst`; do  mkdir blastOut/`basename $i .fa`; done
    # Job template: path1 comes from bug.lst (blast database), path2 from
    # fb.lst (query FASTA); output is blastOut/<query root>/q.<db root>.psl.
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
# Usage: blastSome <blastDb> <query.fa> <out.psl>
# Runs tblastn of <query.fa> against <blastDb>, converts the report to PSL
# with blastToPsl, lifts it with protein.lft (liftUp -pslQ) and moves the
# result to <out.psl>.  Exits 0 when the blastToPsl step succeeded, 1
# otherwise.  All intermediate files live in /tmp and are removed on exit.
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=$(basename "$2")
f=/tmp/$(basename "$3").$g
# Try each E-value in turn and stop at the first blastall that succeeds.
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
    if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d "$1" -i "$2" -o "$f.8"
    then
        mv "$f.8" "$f.1"
        break
    fi
done
if test -f "$f.1"
then
    if /cluster/bin/i386/blastToPsl "$f.1" "$f.2"
    then
        liftUp -nosort -type=".psl" -pslQ -nohead "$3.tmp" /iscratch/i/dm1/protein.lft warn "$f.2"
        mv "$3.tmp" "$3"
        rm -f "$f.1" "$f.2"
        exit 0
    fi
fi
# Failure path: also remove $f.8, the partial report a failed blastall can
# leave behind in /tmp (previously it was never cleaned up).
rm -f "$f.1" "$f.2" "$f.8" "$3.tmp"
exit 1
'_EOF_'

    chmod +x blastSome
    # Cross every blast database (bug.lst) with every query file (fb.lst).
    gensub2 bug.lst fb.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/droVir1/bed/tblastn.dm1FB
    para create blastSpec
    para try, push
    # Recorded summary once the batch had finished
    # (151668 jobs = 396 databases x 383 query files):
# Completed: 151668 of 151668 jobs
# CPU time in finished jobs:    4155662s   69261.03m  1154.35h   48.10d  0.132 y
# IO & Wait Time:                696054s   11600.90m   193.35h    8.06d  0.022 y
# Average job time:                  32s       0.53m     0.01h    0.00d
# Longest job:                    10728s     178.80m     2.98h    0.12d
# Submission to last job:         76564s    1276.07m    21.27h    0.89d

    # Second batch: one chainSome job per blastOut/fb?? directory.
    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    # chainSome <dir>: concatenates <dir>/q.*.psl through simpleChain and
    # writes the result next to the directory as c.<dir name>.psl.
    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    # Absolute paths of the per-query output directories:
    ls -1dS `pwd`/blastOut/fb?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    para create chainSpec

    # should run this on the mini-cluster or with my shove script
    # so you can limit the number of jobs starting to 3 or 4
    para try, push...
    # Recorded summary once the batch had finished:
# Completed: 383 of 383 jobs
# CPU time in finished jobs:       1220s      20.34m     0.34h    0.01d  0.000 y
# IO & Wait Time:                 10767s     179.44m     2.99h    0.12d  0.000 y
# Average job time:                  31s       0.52m     0.01h    0.00d
# Longest job:                      109s       1.82m     0.03h    0.00d
# Submission to last job:           942s      15.70m     0.26h    0.01d

    exit
    # back to kksilo
    cd /cluster/data/droVir1/bed/tblastn.dm1FB/blastOut
    # For each chained file c.<i>.psl:
    #   c60 = rows where (field13 - field12) / field11 > 0.6
    #   u   = c60 sorted numerically descending, then passed through pslUniq
    #   m60 = c60 rows where field1 / field11 > 0.60
    # (Assuming standard PSL column order, fields 11/12/13 are
    # qSize/qStart/qEnd and field 1 is matches -- confirm.)
    for i in fb??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done
    # Merge the u.* and m60.* sets, ordered by fields 14, 16 and 17; uniq
    # then drops rows that appear in both sets (whole-line duplicates only).
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/droVir1/bed/tblastn.dm1FB/blastDm1FB.psl

    ssh hgwdev
    cd /cluster/data/droVir1/bed/tblastn.dm1FB

    hgLoadPsl droVir1 blastDm1FB.psl

# End tblastn


# SWAP CHAINS FROM DM2, BUILD NETS ETC. (REDONE 5/23/05 angie)
# Originally done 3/11/05 -- redone (better params) 5/23/05.
    # Unlike the dm1 steps above (separate swap, chain and net sections),
    # this is a single doBlastzChainNet.pl -swap run driven by the DEF file
    # of the dm2-side alignment; it runs in the background, logging to do.log.
    mkdir /cluster/data/droVir1/bed/blastz.dm2.swap
    cd /cluster/data/droVir1/bed/blastz.dm2.swap
    doBlastzChainNet.pl -swap /cluster/data/dm2/bed/blastz.droVir1/DEF \
      >& do.log &
    tail -f do.log
    # Add {chain,net}Dm2 to trackDb.ra if necessary.


# FIX X BASES -> N IN DOWNLOADABLE FASTA FILES (DONE 3/21/05 angie)
    ssh kksilo
    cd /cluster/data/droVir1
    # Soft-masked FASTA: keep a .bak, translate X/x to N/n.  tr acts on
    # every line, header lines included, hence the checks below.
    mv scaffolds.fa scaffolds.fa.bak
    tr 'Xx' 'Nn' < scaffolds.fa.bak > scaffolds.fa
    diff scaffolds.fa{.bak,} | less
#<only X/x -> N/n sequence changes>
    # The '>' header lines must be identical before and after:
    grep '^>' scaffolds.fa.bak > /tmp/1
    grep '^>' scaffolds.fa > /tmp/2
    diff /tmp/[12]
#<no output>
    # Same procedure for the hard-masked FASTA:
    mv scaffolds.fa.masked scaffolds.fa.masked.bak
    tr 'Xx' 'Nn' < scaffolds.fa.masked.bak > scaffolds.fa.masked
    diff scaffolds.fa.masked{.bak,} | less
    grep '^>' scaffolds.fa.masked.bak > /tmp/1
    grep '^>' scaffolds.fa.masked > /tmp/2
    diff /tmp/[12]
    # Rebuild the two FASTA zips from the fixed files.
    rm zips/scaffoldFa*.zip
    zip -j zips/scaffoldFa.zip scaffolds.fa
    zip -j zips/scaffoldFaMasked.zip scaffolds.fa.masked
    # Replace the published copies (originally copied with cp -p) by
    # symlinks to the rebuilt zips, and regenerate the checksums.
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/droVir1/bigZips
    rm scaffoldFa*.zip md5sum.txt
    ln -s /cluster/data/droVir1/zips/scaffoldFa*.zip .
    nice md5sum *.zip > md5sum.txt

# MAKE Drosophila Proteins track (DONE braney 06-30-05)
    # Same procedure as the dm1FB track above, using the dm2 FlyBase
    # protein alignments and reusing the blast databases already on
    # /iscratch/i/droVir1/blastDb.
    ssh kk
    mkdir -p /cluster/data/droVir1/bed/tblastn.dm2FB
    cd /cluster/data/droVir1/bed/tblastn.dm2FB
    ls -1S /iscratch/i/droVir1/blastDb/*.nsq | sed "s/\.nsq//" > target.lst
    mkdir fbfa
    # calculate a reasonable number of jobs 
    # (target here is ~80000 jobs: psl lines per query file =
    #  lines in dm2FB.psl / (80000 / number of blast databases))
    calc `wc /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl|awk "{print \\\$1}"`/\(80000/`wc target.lst | awk "{print \\\$1}"`\)
# 18929/(80000/396) = 93.698550

    split -l 94 /cluster/data/dm2/bed/blat.dm2FB/dm2FB.psl fbfa/fb
    cd fbfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S fbfa/*.fa > fb.lst
    # blastOut lives on /cluster/bluearc this time; a symlink named
    # blastOut is made in the current directory to point at it.
    mkdir -p /cluster/bluearc/droVir1/bed/tblastn.dm2FB/blastOut  
    ln -s /cluster/bluearc/droVir1/bed/tblastn.dm2FB/blastOut  
    for i in `cat fb.lst`; do  mkdir blastOut/`basename $i .fa`; done
    # NOTE(review): presumably switches to tcsh so that the quoted
    # '_EOF_' heredoc terminators used below are recognized -- confirm.
    tcsh
    # Job template: path1 comes from target.lst (blast database), path2 from
    # fb.lst (query FASTA); output is blastOut/<query root>/q.<db root>.psl.
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    # blastSome <blastDb> <query.fa> <out.psl>: tries blastall at each
    # E-value until one run succeeds, converts the report with blastToPsl,
    # lifts it with the dm2 protein.lft and moves the result to <out.psl>.
    # $f.3 and $f.4 are never created by this script, so removing them is
    # a no-op; unlike the dm1 version, the failure path also removes $f.8.
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/dm2/bed/blat.dm2FB/protein.lft warn $f.2
        mv $3.tmp $3
        rm -f $f.1 $f.2 $f.3 $f.4
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.3 $f.8 $f.4
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 target.lst fb.lst blastGsub blastSpec

    para create blastSpec
    para push

    # Recorded summary once the batch had finished
    # (79992 jobs = 396 databases x 202 query files):
# Completed: 79992 of 79992 jobs
# CPU time in finished jobs:    1933706s   32228.44m   537.14h   22.38d  0.061 y
# IO & Wait Time:                403514s    6725.23m   112.09h    4.67d  0.013 y
# Average job time:                  29s       0.49m     0.01h    0.00d
# Longest finished job:           17598s     293.30m     4.89h    0.20d
# Submission to last job:         42181s     703.02m    11.72h    0.49d

    # Second batch, run from kki: one chainSome job per blastOut/fb??
    # directory.
    ssh kki
    cd /cluster/data/droVir1/bed/tblastn.dm2FB
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    # chainSome <dir>: concatenates <dir>/q.*.psl through simpleChain and
    # writes the result next to the directory as c.<dir name>.psl.
    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/fb?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    para create chainSpec
    para push
 
    # Recorded summary once the batch had finished:
# Completed: 202 of 202 jobs
# CPU time in finished jobs:       1254s      20.90m     0.35h    0.01d  0.000 y
# IO & Wait Time:                  6632s     110.53m     1.84h    0.08d  0.000 y
# Average job time:                  39s       0.65m     0.01h    0.00d
# Longest finished job:             218s       3.63m     0.06h    0.00d
# Submission to last job:           580s       9.67m     0.16h    0.01d

    cd /cluster/data/droVir1/bed/tblastn.dm2FB/blastOut
    # For each chained file c.<i>.psl:
    #   c60 = rows where (field13 - field12) / field11 > 0.6
    #   u   = c60 sorted numerically descending, then passed through pslUniq
    #   m60 = c60 rows where field1 / field11 > 0.60
    for i in fb??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    # Merge the u.* and m60.* sets and drop rows present in both.
    # Do NOT use "sort -u" here: combined with -k, -u treats lines as equal
    # when only the KEYS (fields 14, 16, 17) match, so different alignments
    # sharing those three fields would collapse to a single row.  Piping
    # through uniq removes whole-line duplicates only, as in the dm1FB
    # section above.
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq > /cluster/data/droVir1/bed/tblastn.dm2FB/blastDm2FB.psl
    cd ..

    # Load the merged alignments as the blastDm2FB track table.
    ssh hgwdev
    cd /cluster/data/droVir1/bed/tblastn.dm2FB
    hgLoadPsl droVir1 blastDm2FB.psl
    exit

    # back to kksilo
    # NOTE(review): blastOut was created above as a symlink to
    # /cluster/bluearc/droVir1/bed/tblastn.dm2FB/blastOut; rm -rf on the
    # link name removes only the link, so the bluearc directory presumably
    # needs separate cleanup -- confirm.
    rm -rf blastOut

# End tblastn


# GENEID PREDICTIONS FROM IMIM (DONE 7/26/05 angie)
    ssh hgwdev
    mkdir /cluster/data/droVir1/bed/geneid
    cd /cluster/data/droVir1/bed/geneid
    # Fetch IMIM's precomputed predictions (one scaffolds.gtf for the whole
    # assembly) and load them as the geneid table.  The directory was just
    # created, so *.gtf matches only the downloaded file.
    wget http://genome.imim.es/genepredictions/D.virilis/golden_path_200407/geneidv1.2/scaffolds.gtf
    ldHgGene -gtf -genePredExt droVir1 geneid *.gtf


