# for emacs: -*- mode: sh; -*-


# This file describes how to make the browser database for the
# worm C. briggsae

###########################################################################
# DOWNLOAD SEQUENCE (DONE, 2005-04-29, hiram)
    ssh kkstore02
    mkdir /cluster/store5/worm/cb2
    cd /cluster/store5/worm/cb2
    mkdir wustl
    cd wustl
    #	Fetch the assembly tarball into the current (wustl) directory.
    #	wget only takes URLs as arguments and saves into the cwd by
    #	default; a trailing "." would be parsed as a second URL and fail.
    wget "ftp://genome.wustl.edu/private/lhillier/old/cb2.tar.gz"
    #	Unpack, dropping the leading path component of every member.
    #	--strip-components is the current GNU tar name for what older
    #	releases called --strip-path.
    tar --strip-components=1 -xvzf cb2.tar.gz
    faSize cb25.agp8mod.fasta
# 108124579 bases (2974785 N's 105149794 real 105149794 upper 0 lower) in
#	607 sequences in 1 files
    faCount cb25.agp8mod.fasta > contigs.faCount.txt
    grep "^>" cb25.agp8mod.fasta > contig.names
    mkdir contigs
    cd contigs
    faSplit byname ../cb25.agp8mod.fasta .
    #	There was a broken sequence cb25.fpc0071c.fa
    #	in this original file, the corrected sequence was received
    #	from LaDeana Hillier 2005-09-08 and placed into
    #	this contigs directory.  And then, rebuild this
    #	cb25.agp8mod.fasta file:
    cd /cluster/data/cb2/wustl
    mv cb25.agp8mod.fasta cb25.agp8mod.fasta.broken
    gzip cb25.agp8mod.fasta.broken &
    cat contigs/c*.fa > cb25.agp8mod.fasta

    cd /cluster/store5/worm/cb2
    #	Create chrom fasta records, all upper case
    time for A in wustl/chr*.agp
    do
	AGP=`basename $A`
	CHR=${AGP/.agp/}
	echo $AGP $CHR
	mkdir -p ${CHR}
	$HOME/bin/i386/agpToFa -verbose=2 -simpleMulti ${A} ${CHR} \
		stdout wustl/cb25.agp8mod.fasta | \
		tr '[a-z]' '[A-Z]' | \
		sed -e "s/^>CHRUN/>chrUn/; s/^>CHR/>chr/; s/RANDOM/random/" \
			> ${CHR}/${CHR}.fa 
	rm -f ./${CHR}/${AGP}
	ln -s ../wustl/${AGP} ./${CHR}/${AGP}
    done
    #	1m30s
#	A single error  (This has been fixed as mentioned above.)
#	chrI.agp chrI
#       cb25.fpc0071c start:0 end:1956661 seqSize: 1662314
#	1 illegal coordinates found in agp files.  (it was bad sequence,
#	not agp error)
#	Fragment copy is more than available fragment sequence.
#	Artificially fix this until a corrected agp is delivered:
< OK original chrI.agp
> broken chrI.agp
15,16c15,16
< chrI  3303228 5259888 15      W       cb25.fpc0071c   1       1956661 +
< chrI  5259889 5261888 16      N       2000    contig  no
---
> chrI  3303228 4965541 15      W       cb25.fpc0071c   1       1662314 +
> chrI  4965542 5261888 16      N       294437  contig  no

    faCount chr*/chr*.fa
# #seq    len     A       C       G       T       N       cpg
# chrI    11066658        3330566 2012567 2016656 3341307 365562  369196
# chrII   14273684        4309027 2591324 2597030 4309946 466357  453226
# chrIII  13311297        4030612 2419456 2407118 4032755 421356  432253
# chrIII_random   1149121 348122  206481  205194  348271  41053   36576
# chrII_random    2403442 717077  420926  420733  716887  127819  72445
# chrI_random     3767006 1139118 682120  683610  1145925 116233  123073
# chrIV   15085352        4579142 2700476 2693260 4578635 533839  473913
# chrIV_random    884002  265220  157795  155141  269484  36362   27411
# chrUn   7825149 2272582 1301502 1301660 2278995 670410  226977
# chrV    15759610        4783092 2884362 2881565 4796302 414289  497767
# chrV_random     2980273 910601  539084  535605  914408  80575   93300
# chrX    20107906        6145898 3698455 3673518 6159052 430983  632930
# chrX_random     530426  165643  98518   99104   165320  1841    16683
# total   109143926       32996700        19713066        19670194        33057287        3706679 3455750

    faToTwoBit chr*/chr*.fa cb2.2bit

    #	chromInfo.tab: name, size and 2bit path for every sequence in
    #	cb2.2bit, in the three-column form loaded into the chromInfo
    #	table below.
    twoBitInfo cb2.2bit stdout |
        awk '{printf "%s\t%s\t/gbdb/cb2/cb2.2bit\n", $1,$2}' \
        > chromInfo.tab
    #	chrom.sizes: name and size, largest sequence first.
    #	"-k2" is the current spelling of the obsolete "+1" sort key
    #	(zero-based field 1 == field 2), which newer sort versions
    #	may take for a file name instead.
    twoBitInfo cb2.2bit stdout | sort -rn -k2 > chrom.sizes

    #	Back on hgwdev to create cluster data symlinks and start database
    ssh hgwdev
    ln -s /cluster/store5/worm/cb2 /cluster/data/cb2
    mkdir /gbdb/cb2
    ln -s /cluster/data/cb2/cb2.2bit /gbdb/cb2

    cd /cluster/data/cb2

    hgsql -e "create database cb2;" mysql
    hgsql -e "create table grp (PRIMARY KEY(NAME)) select * from hg17.grp;" cb2
    hgsql cb2 < $HOME/kent/src/hg/lib/chromInfo.sql
    hgsql -e 'load data local infile "chromInfo.tab" into table chromInfo;' cb2

     # Enter cb2 into dbDb and defaultDb so test browser knows about
     # it:
    hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
        defaultPos, active, orderKey, genome, scientificName, \
        htmlPath, hgNearOk, hgPbOk, sourceName) \
        VALUES("cb2", "Aug 2005", "/gbdb/cb2", "C. briggsae", \
        "chrI:10000-110000", 1, 69, "C. briggsae", \
        "Caenorhabditis briggsae", \
        "/gbdb/cb2/html/description.html", 0, 0, \
        "WUSTL Aug05");' \
        -h localhost hgcentraltest
    #	Update the defaultDb entry
    hgsql -e 'UPDATE defaultDb set name="cb2" where name="cb1";' hgcentraltest

    mkdir html
    ln -s /cluster/data/cb2/html /gbdb/cb2/html

    #	Add cb2 to the trackDb/makefile and an empty directory in
    #	trackDb/worm/cb2
    #	Perform a make in trackDb directory to get the trackDb
    #	initialized for cb2
    #	chromList holds one chromosome name per line; it drives the
    #	"for C in `cat chromList`" masking loops later in this doc.
    #	List only the per-chromosome directories (note the trailing
    #	slash in the glob) so that plain files in this directory whose
    #	names also begin with "chr" -- chromInfo.tab, chrom.sizes and
    #	chromList itself -- can never end up in the list.
    ls -d chr*/ | sed -e 's,/$,,' > chromList

###########################################################################
#	Load the gold/gap tables	(DONE - 2005-08-10 - Hiram)
#	Redone - 2005-09-08 - Hiram
    cat chr*/chr*.agp | hgGoldGapGl cb2 stdin
#  !!! *** The indexes do not get built when hgGoldGapGl is run like this
#	To check:
    hgsql -e "show index from gold;" cb2
    hgsql -e "analyze table gold;" cb2
    hgsql -e "show index from gold;" cb2
    hgsql -e "show index from gap;" cb2
    hgsql -e "analyze table gap;" cb2
    hgsql -e "show index from gap;" cb2
    #	The show index after the analyze will show more numbers in the
    #	Cardinality of the index

    featureBits cb2 gap
#	725000 bases of 108418926 (0.669%) in intersection

###########################################################################
#  Prepare scratch area for cluster runs (DONE, 2005-08-09 - Hiram)
#	Redone - 2005-09-08 - Hiram
    ssh hgwdev
    mkdir -p /san/sanvol1/scratch/worms/cb2
    cd /san/sanvol1/scratch/worms/cb2
    mkdir chroms
    cp -p /cluster/data/cb2/chr*/chr*.fa ./chroms
    cp -p /cluster/data/cb2/cb2.2bit .

    ###########################################################################
    # PREPARE Split contigs into 100,000 bp chunks for cluster runs
    #	The chroms are split.  The randoms and chrUn simply use their
    #	fragments as is.  Anytime these fragment results need to be put
    #	back together, use the lift files created from the agp via the
    #	perl script as used below.
    # (DONE, 2005-08-05, Hiram)
    #	Redone - 2005-09-08 - Hiram
    # next machine
    ssh kkstore02
    mkdir -p /san/sanvol1/scratch/worms/cb2/split
    cd /san/sanvol1/scratch/worms/cb2
time for C in I II III IV V X
do
    mkdir split/${C}
    faSplit size chroms/chr${C}.fa 100000 split/${C}/${C} \
	-lift=split/chr${C}.lft
done
time for C in I_random II_random III_random IV_random V_random X_random Un
do
    CHR="chr${C}"
    rm -fr split/${C}
    mkdir split/${C}
    grep -v contig /cluster/data/cb2/${CHR}/${CHR}.agp \
	| sed -e "/^$/d" | awk '{print $6}' \
	| while read FN
do
    cp -p /cluster/data/cb2/wustl/contigs/${FN}.fa ./split/${C}
done
    /cluster/data/cb2/scripts/agpToLift.pl \
	/cluster/data/cb2/${CHR}/${CHR}.agp > \
	split/${CHR}.lft
    echo "done with ${CHR}"
done

   cat split/c*I.lft split/c*V.lft split/c*X.lft > liftChroms.lft
   cat split/*_random.lft split/chrUn.lft > liftRandoms.lft

    #	copy the contigs for the randoms back here from the bluearc
    #	for masking and blastz runs
    ssh kkstore02
    cd /cluster/data/cb2
    mkdir randomContigs
    for C in I_random II_random III_random IV_random V_random X_random Un
    do
	rsync -a --progress /san/sanvol1/scratch/worms/cb2/split/${C}/ \
		`pwd`/randomContigs/${C}/
    done

############################################################################
# Run RepeatMasker on the chromosomes (DONE - 2005-08-09 - Hiram)
#	RE-DONE - 2005-09-08 - Hiram
    ssh pk
    cd /cluster/data/cb2
    # make run directory and job list, create the script to use 
    # for the RepeatMasker run
    #	Write scripts/RMWorm, the per-contig RepeatMasker job script
    #	(a csh script, "#!/bin/csh -fe").
    #	Called as:  RMWorm <chrom> <splitDir> <contig.fa>
    #	It changes into directory <chrom> (relative to the directory the
    #	job is started in), copies <splitDir>/<contig.fa> into the work
    #	directory /tmp/cb2/<contig.fa>/<chrom>, runs RepeatMasker there
    #	with "-species elegans", then copies the resulting .out file
    #	(and the .align, .tbl and .cat files when they exist) back into
    #	<chrom>/ and removes the /tmp work directory.
    cat << '_EOF_' > scripts/RMWorm
#!/bin/csh -fe
#
#       This is a slight rearrangement of the
#       RMChicken script used in makeGalGal2.doc
#       The results here need to go to a different location
#       $1 == chrom name: I II III IV V X M
#       $2 == directory where split contig .fa is found
#       $3 == name of contig .fa file
cd $1
pushd .
cd $2
/bin/mkdir -p /tmp/cb2/$3/$1
/bin/cp $3 /tmp/cb2/$3/$1
cd /tmp/cb2/$3/$1
/cluster/bluearc/RepeatMasker050305/RepeatMasker -alignments -s -species elegans $3
popd
/bin/cp /tmp/cb2/$3/$1/$3.out ./
if (-e /tmp/cb2/$3/$1/$3.align) /bin/cp /tmp/cb2/$3/$1/$3.align ./
if (-e /tmp/cb2/$3/$1/$3.tbl) /bin/cp /tmp/cb2/$3/$1/$3.tbl ./
if (-e /tmp/cb2/$3/$1/$3.cat) /bin/cp /tmp/cb2/$3/$1/$3.cat ./
/bin/rm -r /tmp/cb2/$3/$1
/bin/rmdir --ignore-fail-on-non-empty /tmp/cb2/$3
/bin/rmdir --ignore-fail-on-non-empty /tmp/cb2
'_EOF_'
    # emacs happy
    chmod +x scripts/RMWorm
    # create job list
    mkdir RMRun
    rm -f RMRun/jobList
for C in I II III IV V X Un II_random I_random III_random IV_random \
	V_random X_random
do
    mkdir /cluster/data/cb2/RMRun/${C}
    for T in /san/sanvol1/scratch/worms/cb2/split/$C/*.fa
    do
	D=`dirname $T`
	F=`basename $T`
	echo /cluster/data/cb2/scripts/RMWorm ${C} ${D} ${F} \
'{'check out line+ /cluster/data/cb2/RMRun/$C/${F}.out'}'
    done >> RMRun/jobList
done
    # Do the run
    ssh pk
    cd /cluster/data/cb2/RMRun
    para create jobList
    para try, para check, para check, para push, para check, ...
XXX - running - 2005-09-08
Completed: 1369 of 1369 jobs
CPU time in finished jobs:     713871s   11897.85m   198.30h    8.26d  0.023 y
IO & Wait Time:                  9868s     164.47m     2.74h    0.11d  0.000 y
Average job time:                 529s       8.81m     0.15h    0.01d
Longest finished job:           12428s     207.13m     3.45h    0.14d
Submission to last job:         28879s     481.32m     8.02h    0.33d
    #	Looks like the big contigs are the outliers here.  Most
    #	everything else finishes in about 4 minutes.

    # when they are finished, liftUp and load the .out files into the database:
    # next machine
    ssh kkstore02
    cd /cluster/data/cb2/RMRun
for C in I II III IV V X Un II_random I_random III_random IV_random \
	V_random X_random
do
    liftUp chr${C}.fa.out \
	/san/sanvol1/scratch/worms/cb2/split/chr${C}.lft warn ${C}/*.fa.out
done

    #	Concatenate the lifted per-chromosome RepeatMasker outputs into
    #	the single file rmsk.fa.out.  chrI.fa.out is copied whole so
    #	its header lines are kept once; every other file is appended
    #	starting at its 4th line, i.e. without its first three lines.
    #	"tail -n +4" is the portable form of the obsolete "tail +4" and
    #	matches the --lines=+4 already used for the randoms.
    cat chrI.fa.out > rmsk.fa.out
    for C in II III IV V X Un
    do
	tail -n +4 chr${C}.fa.out >> rmsk.fa.out
    done
    tail --silent --lines=+4 chr*_random.fa.out >> rmsk.fa.out
    ssh hgwdev
    cd /cluster/data/cb2/RMRun
    hgLoadOut -nosplit -verbose=2 cb2 rmsk.fa.out
# bad rep range [480, 441] line 6473 of rmsk.fa.out 
# bad rep range [330, 281] line 34048 of rmsk.fa.out 
# bad rep range [282, 274] line 34050 of rmsk.fa.out 
# bad rep range [333, 268] line 51955 of rmsk.fa.out
# note: 4 records dropped due to repStart > repEnd

    featureBits cb2 rmsk
# 16115005 bases of 108418926 (14.864%) in intersection
    featureBits -countGaps cb2 rmsk
# 16115005 bases of 109143926 (14.765%) in intersection

#######################################################################
# SIMPLE REPEAT [TRF] TRACK  (DONE - 2005-08-10 - Hiram)
#	RE-DONE - 2005-09-09 - Hiram
    ssh kki
    mkdir -p /cluster/data/cb2/bed/simpleRepeat
    cd /cluster/data/cb2/bed/simpleRepeat
    mkdir trf
    ls -1S /san/sanvol1/scratch/worms/cb2/chroms/chr*.fa > genome.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)}  /dev/null -bedAt={check out line trf/$(root1).bed} -tempDir=/tmp
#ENDLOOP
'_EOF_'
    #	happy emacs
                                                                                
    gensub2 genome.lst single gsub jobList
    para create jobList
    para try
    #	only 2 CPUs today:
# Completed: 13 of 13 jobs
# CPU time in finished jobs:       1065s      17.75m     0.30h    0.01d  0.000 y
# IO & Wait Time:                   446s       7.43m     0.12h    0.01d  0.000 y
# Average job time:                 116s       1.94m     0.03h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             408s       6.80m     0.11h    0.00d
# Submission to last job:           756s      12.60m     0.21h    0.01d

    #  When cluster run is done, combine into one:
    sort -k1,1 -k2,2n trf/*.bed > simpleRepeat.bed

    # Load into the database:
    # next machine
    ssh hgwdev
    cd /cluster/data/cb2/bed/simpleRepeat
    hgLoadBed -strict cb2 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
    #	Loaded 32690 elements of size 16

    featureBits cb2 simpleRepeat
    #	3977808 bases of 108418926 (3.669%) in intersection

    #	And run trf on the contigs for the separate masking and blastz runs
    ssh kki
    mkdir /cluster/data/cb2/bed/simpleRepeat/randomContigs
    cd /cluster/data/cb2/bed/simpleRepeat/randomContigs
    mkdir trf
    ls -1S /san/sanvol1/scratch/worms/cb2/split/*_random/*.fa \
	/san/sanvol1/scratch/worms/cb2/split/Un/*.fa > contig.lst

    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/x86_64/trfBig -trf=/cluster/bin/i386/trf {check in line+ $(path1)}  /dev/null -bedAt={check out line trf/$(root1).bed} -tempDir=/tmp
#ENDLOOP
'_EOF_'
    #	happy emacs
                                                                                
    gensub2 contig.lst single gsub jobList
    para create jobList
    para try
    #	only 2 CPUs today:
# Completed: 470 of 470 jobs
# CPU time in finished jobs:        414s       6.90m     0.11h    0.00d  0.000 y
# IO & Wait Time:                  2522s      42.03m     0.70h    0.03d  0.000 y
# Average job time:                   6s       0.10m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              73s       1.22m     0.02h    0.00d
# Submission to last job:          1468s      24.47m     0.41h    0.02d

    mkdir -p trfMask
for F in trf/*.bed
do
    T=${F#trf/}
    echo "${F} > trfMask/${T}"
    awk '{if ($5 <= 12) print;}' ${F} > trfMask/${T}
done


#######################################################################
# PROCESS SIMPLE REPEATS AND RMSK INTO MASK (DONE,  2005-08-10 - Hiram)
#	RE-DONE - 2005-09-09 - Hiram
    # After the simpleRepeats track has been built, make a filtered version
    # of the trf output: keep trf's with period <= 12:
    ssh kkstore02
    cd /cluster/data/cb2/bed/simpleRepeat
    mkdir -p trfMask
for F in trf/*.bed
do
    T=${F#trf/}
    echo "${F} > trfMask/${T}"
    awk '{if ($5 <= 12) print;}' ${F} > trfMask/${T}
done

#  create Soft and Hard masks from RepeatMasker and TRF outputs:
#  and rebuild the 2bit file using the soft masking in the fa.
#	Might need the nibs for something, so make those too.
    # next machine
    ssh kkstore02
    cd /cluster/data/cb2
    mkdir softMask nib
for C in `cat chromList`
do
    echo -n "masking ${C} "
    rm -f softMask/${C}.fa
    maskOutFa ${C}/${C}.fa RMRun/${C}.fa.out \
                softMask/${C}.fa -soft
    maskOutFa softMask/${C}.fa \
                bed/simpleRepeat/trfMask/${C}.bed \
                softMask/${C}.fa -softAdd
    rm -f nib/${C}.nib
    faToNib -softMask softMask/${C}.fa nib/${C}.nib
done
# masking chrI Writing 11066658 bases in 5533337 bytes
# masking chrII Writing 14273684 bases in 7136850 bytes
# masking chrIII Writing 13311297 bases in 6655657 bytes
# masking chrIII_random Writing 1149121 bases in 574569 bytes
# masking chrII_random Writing 2403442 bases in 1201729 bytes
# masking chrI_random Writing 3767006 bases in 1883511 bytes
# masking chrIV Writing 15085352 bases in 7542684 bytes
# masking chrIV_random Writing 884002 bases in 442009 bytes
# masking chrUn Writing 7825149 bases in 3912583 bytes
# masking chrV Writing 15759610 bases in 7879813 bytes
# masking chrV_random Writing 2980273 bases in 1490145 bytes
# masking chrX Writing 20107906 bases in 10053961 bytes
# masking chrX_random Writing 530426 bases in 265221 bytes

    #	re-create the 2bit file
    rm -f cb2.2bit
    faToTwoBit softMask/chr*.fa cb2.2bit

    # create hard masks 
    mkdir hardMask
for C in `cat chromList`
do
    echo "masking ${C}"
    maskOutFa softMask/${C}.fa hard hardMask/${C}.fa
done

    mkdir randomContigs/softMask randomContigs/hardMask randomContigs/nib
    for C in I_random II_random III_random IV_random V_random X_random Un
    do
	for F in randomContigs/${C}/*.fa
	do
	    B=`basename ${F}`
	    T=${B/.fa/}
	    echo "${B} ${T}"
	    maskOutFa ${F} RMRun/${C}/${B}.out \
                randomContigs/softMask/${B} -soft
	    maskOutFa randomContigs/softMask/${B} \
                bed/simpleRepeat/randomContigs/trfMask/${T}.bed \
                randomContigs/softMask/${B} -softAdd
	    maskOutFa randomContigs/softMask/${B} hard \
		randomContigs/hardMask/${B}
	    faToNib -softMask randomContigs/softMask/${B} \
		randomContigs/nib/${T}.nib
	done
    done
    #	Second pass over the random contigs: (re)build just the nib
    #	files from the soft-masked fasta.  randomContigs/nib is already
    #	created by the mkdir that precedes the masking loop above, so
    #	use mkdir -p here; a plain mkdir errors out on the existing
    #	directory.
    mkdir -p randomContigs/nib
    for C in I_random II_random III_random IV_random V_random X_random Un
    do
	for F in randomContigs/${C}/*.fa
	do
	    B=`basename ${F}`
	    T=${B/.fa/}
	    echo "${B} ${T}"
	    faToNib -softMask randomContigs/softMask/${B} \
		randomContigs/nib/${T}.nib
	done
    done
    faToTwoBit softMask/chrI.fa softMask/chrII.fa softMask/chrIII.fa \
	softMask/chrIV.fa softMask/chrV.fa softMask/chrX.fa \
	randomContigs/softMask/*.fa chrRandomContigs.2bit
    #	Check that all the sequence is there:
    twoBitToFa chrRandomContigs.2bit stdout | faSize stdin
    # 108680926 bases (3243679 N's 105437247 real 89174354 upper 16262893 lower) in 476 sequences in 1 files
    faSize softMask/c*.fa
    # 109143926 bases (3706679 N's 105437247 real 89174354 upper 16262893 lower) in 13 sequences in 13 files
    #	Note the real, upper and lower numbers are the same, only the
    #	N's are different


    #	copy to san for cluster runs
    ssh kkstore02
    mkdir -p /san/sanvol1/scratch/worms/cb2/chromNib
    mkdir /san/sanvol1/scratch/worms/cb2/randomContigs
    mkdir /san/sanvol1/scratch/worms/cb2/nib
    cd /cluster/data/cb2/nib
    cp -p c*I.nib c*V.nib c*X.nib /san/sanvol1/scratch/worms/cb2/chromNib
    cp -p c*.nib /san/sanvol1/scratch/worms/cb2/nib
    cd  /cluster/data/cb2
    cp -p cb2.2bit /san/sanvol1/scratch/worms/cb2
    cp -p chrRandomContigs.2bit /san/sanvol1/scratch/worms/cb2
    rsync -a --progress /cluster/data/cb2/randomContigs/softMask/ \
	/san/sanvol1/scratch/worms/cb2/randomContigs/softMask/
    rsync -a --progress /cluster/data/cb2/randomContigs/hardMask/ \
	/san/sanvol1/scratch/worms/cb2/randomContigs/hardMask/

#######################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE 2005-08-17 Hiram)
#	RE-DONE - 2005-09-09 - Hiram
    # Use -repMatch=100 (based on size -- for human we use 1024, and 
    # this worm size is ~3.7% of human judging by gapless cb2 genome size from 
    # featureBits -- we would use 37, but bump that up a bit to be more 
    # conservative).
    #	featureBits hg17 rmsk
    #	1390952984 bases of 2866216770 (48.529%) in intersection
    #	featureBits cb2 rmsk
    #	16092866 bases of 108124579 (14.884%) in intersection
    #	108124579 / 2866216770 = 0.037723 == 3.7%
    ssh kkstore02
    cd /cluster/data/cb2
    blat cb2.2bit /dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=100
# Wrote 8125 overused 11-mers to 11.ooc
    cp -p 11.ooc /san/sanvol1/scratch/worms/cb2

#######################################################################
# GC5BASE (DONE - 2005-08-10 - Hiram)
#	RE-DONE - 2005-09-09 - Hiram
    ssh kkstore02
    mkdir /cluster/data/cb2/bed/gc5Base
    cd /cluster/data/cb2/bed/gc5Base
    time hgGcPercent -wigOut -doGaps -file=stdout -win=5 cb2 \
        /cluster/data/cb2 | wigEncode stdin gc5Base.wig gc5Base.wib
    #	40 seconds

    ssh hgwdev
    cd /cluster/data/cb2/bed/gc5Base
    mkdir /gbdb/cb2/wib
    ln -s `pwd`/gc5Base.wib /gbdb/cb2/wib
    hgLoadWiggle cb2 gc5Base gc5Base.wig
    rm wiggle.tab

#######################################################################
# sangerGeneWS140 - mapping Ce3 sangerGene onto this sequence
#	(DONE - 2005-08-17 - Hiram)
#	(RE-DONE - 2005-09-09 - Hiram)
#	fetched blast-2.2.11 x86_64 executable from:
#	ftp://ftp.ncbi.nlm.nih.gov/blast/executables/LATEST/
#	Installed in /cluster/bluearc/blast2211x86_64
#	The i386 versions wouldn't work on these machines.
#
    ssh kkstore02
    #	create .fa sequences for blastDb
    mkdir /san/sanvol1/scratch/worms/cb2/blastDb
    cd /san/sanvol1/scratch/worms/cb2/blastDb
for C in I II III IV V X
do
    CHR="chr${C}"
    mkdir ${CHR}
    faSplit size /cluster/data/cb2/${CHR}/${CHR}.fa 100000 ${CHR}/${CHR}_ \
	-lift=${CHR}.lft
done
    #	some of the randomContigs are too large, so split them too
    mkdir randomContigs
for C in I_random II_random III_random IV_random V_random X_random Un
do
    CHR="chr${C}"
    mkdir ${CHR}
    mkdir randomContigs/${CHR}
    grep -v contig /cluster/data/cb2/${CHR}/${CHR}.agp | sed -e "/^$/d" | \
	awk '{print $6}' | while read FN
do
    CTG=${FN#cb25.}
    faSplit size /cluster/data/cb2/wustl/contigs/${FN}.fa \
	100000 ${CHR}/${CTG}_ -lift=randomContigs/${CHR}/${CTG}.lft
done
    cat randomContigs/${CHR}/*.lft > randomContigs/${CHR}.lft
    /cluster/data/cb2/scripts/agpToLift.pl \
	/cluster/data/cb2/${CHR}/${CHR}.agp > ${CHR}.lft
    echo "done with ${CHR}"
done
    #	consolidate the lift files
    cat c*I.lft c*V.lft c*X.lft > liftChroms.lft
    cat c*_random.lft chrUn.lft > liftRandoms.lft
    cat liftChroms.lft liftRandoms.lft randomContigs/*.lft > liftAll.lft

    #	Make all the .fa files exist in one directory
    mkdir fasta
    cd fasta
    ls ../chr*/*.fa | wc
    #	1484    1484   32470
    #	put those 1,484 files together into 10 files to allow for
    #	reasonably sized kluster jobs
    cat ../chr*/*.fa > tmp.fa
    faSplit sequence tmp.fa 10 c_
    rm -f tmp.fa

    #	And construct the blast database
    for i in *.fa
    do
    /cluster/bluearc/blast2211x86_64/bin/formatdb -i $i -p F
    done

    mkdir -p /cluster/data/cb2/bed/tblastn.sangerGene
    cd /cluster/data/cb2/bed/tblastn.sangerGene

    ls -1S /san/sanvol1/scratch/worms/cb2/blastDb/fasta/*.fa > query.lst
    ls -1S /san/sanvol1/scratch/worms/ce3/splitPep/*.fa > pep.lst

    mkdir blastOut
    for i in `cat pep.lst`; do mkdir blastOut/`basename $i .fa`; done

    cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check in line+ $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    #	happy emacs

    #	Write blastSome, the per-job script named in the gsub template
    #	above.  Called as:  blastSome <blastDb fasta> <peptide fasta> <out.psl>
    #	It runs tblastn of the peptides against the database, retrying
    #	with successively stricter e-values until blastall succeeds,
    #	converts the report to PSL, lifts it twice through liftAll.lft
    #	and, when pslCheck accepts the result, copies it to <out.psl>.
    #	The job exits 0 only when <out.psl> has been written; every
    #	other path removes its /tmp work files and exits 1.  (Earlier
    #	the script exited 0 even when pslCheck rejected the PSL, leaving
    #	no output and stale files in /tmp.)
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast2211x86_64/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast2211x86_64/bin/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -nohead $f.3 /san/sanvol1/scratch/worms/cb2/blastDb/liftAll.lft warn $f.2
        liftUp -nosort -type=".psl" -nohead $f.4 /san/sanvol1/scratch/worms/cb2/blastDb/liftAll.lft carry $f.3
        if pslCheck -prot $f.4
        then
            cp -p $f.4 $3
            rm -f $f.1 $f.2 $f.3 $f.4
            # success is reported only once the output has been delivered
            exit 0
        fi
    fi
fi
# any failure above falls through to here: clean up and report it
rm -f $f.1 $f.2 $f.8 $f.3 $f.4
exit 1
'_EOF_'
    #	happy emacs
    chmod +x blastSome
    
    #	Now, to the kluster run
    ssh pk
    cd /cluster/data/cb2/bed/tblastn.sangerGene
    gensub2 query.lst pep.lst gsub jobList
    para create jobList
    para try; push; check ... etc ...
    #	with lots of pk competition:
# Completed: 8170 of 8170 jobs
# CPU time in finished jobs:      89221s    1487.02m    24.78h    1.03d  0.003 y
# IO & Wait Time:                 33532s     558.86m     9.31h    0.39d  0.001 y
# Average job time:                  15s       0.25m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              63s       1.05m     0.02h    0.00d
# Submission to last job:          6937s     115.62m     1.93h    0.08d

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
    #	happy emacs

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=7000 stdin ../c.`basename $1`.psl)
'_EOF_'
    #	happy emacs
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/wp???? > chain.lst
    gensub2 chain.lst single chainGsub chainJobs
    para create chainJobs
    para try; push; check ... etc ...
    #	with lots of pk contention
# Completed: 817 of 817 jobs
# CPU time in finished jobs:         46s       0.77m     0.01h    0.00d  0.000 y
# IO & Wait Time:                  3635s      60.58m     1.01h    0.04d  0.000 y
# Average job time:                   5s       0.08m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              13s       0.22m     0.00h    0.00d
# Submission to last job:           266s       4.43m     0.07h    0.00d

    ssh kkstore02 
    cd /cluster/data/cb2/bed/tblastn.sangerGene/blastOut
    for i in wp????
    do
        awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
        sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
        awk "((\$1 / \$11) ) > 0.60 {print}" c60.$i.psl > m60.$i.psl
        echo $i
    done

    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n \
	| uniq  > ../preblastSangerGene.psl
XXXX
    cd ..
#  haven't tried this yet - 2005-08-12 - should be interesting, need a
#  psl file indicating where the sangerGene's are on Ce3 and alias name list
    blatDir=/cluster/data/hg16/bed/blat.hg16KG
    protDat -kg preblastHg16KG.psl $blatDir/hg16KG.psl $blatDir/kg.mapNames blastHg16KG.psl
XXXX

    ssh hgwdev
    cd /cluster/data/cb2/bed/tblastn.sangerGene
    hgLoadPsl -table=blastSangerGene cb2 preblastSangerGene.psl
    
    # clean up
    ssh kkstore02
    cd /cluster/data/cb2/bed/tblastn.sangerGene
    rm -rf blastOut

    #	Is it sane:
    featureBits cb2 blastSangerGene
    #	13676954 bases of 108418926 (12.615%) in intersection

#########################################################################
# BLASTZ C. remanei caeRem1 (WORKING - 2005-09-14 Hiram)
#	
    ssh pk
    mkdir /cluster/data/cb2/bed/blastzCaeRem1.2005_09_14
    cd /cluster/data/cb2/bed/blastzCaeRem1.2005_09_14

    #	utilizing small target chunks and a single query chunk so that
    #	the dynamic masking with parameter M can function.

    cat << '_EOF_' > DEF
# Cb2 vs caeRem1
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64:/cluster/home/angie/schwartzbin:/parasol/bin

BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_M=50
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Cb2
SEQ1_DIR=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit
SEQ1_2BIT=/san/sanvol1/scratch/worms/cb2/cb2.2bit
SEQ1_LIFT=/san/sanvol1/scratch/worms/cb2/liftRandoms.lft
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=1000000
SEQ1_LAP=100

# QUERY: remanei caeRem1
SEQ2_DIR=/san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit
SEQ2_2BIT=/san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=150000000
SEQ2_LAP=0

BASE=/cluster/data/cb2/bed/blastzCaeRem1.2005_09_14

SEQ1_LEN=$BASE/S1.len
SEQ1_CTGLEN=$BASE/S1ctg.len
SEQ2_LEN=$BASE/S2.len
TMPDIR=/scratch/tmp
'_EOF_'
    # << keep emacs coloring happy

    nibSize /san/sanvol1/scratch/worms/cb2/nib/*.nib \
	| awk '{printf "%s\t%s\n", $2, $3}' > S1.len
    twoBitInfo /san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit S1ctg.len
    twoBitInfo /san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit S2.len

    #	establish a screen to control this job
    screen
    cd /cluster/data/cb2/bed/blastzCaeRem1.2005_09_14
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk \
	-stop=blastz \
	`pwd`/DEF > blastz.out 2>&1 &
    #	STARTED 2005-09-14 16:15

#########################################################################
# BLASTZ C. remanei caeRem1 (WORKING - 2005-08-15 Hiram)
#	RE-DONE 2005-09-09 - Hiram
#	
    ssh kk
    mkdir /cluster/data/cb2/bed/blastzCaeRem1.2005_09_09
    cd /cluster/data/cb2/bed/blastzCaeRem1.2005_09_09

    #	utilizing small target chunks and a single query chunk so that
    #	the dynamic masking with parameter M can function.

    cat << '_EOF_' > DEF
# Cb2 vs caeRem1
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run.v7
BLASTZ=blastz.v7
BLASTZ_H=2000
BLASTZ_M=50
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Cb2 - full chroms only, no randoms
SEQ1_DIR=/san/sanvol1/scratch/worms/cb2/chromNib
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=500000
SEQ1_LAP=50

# QUERY: remanei caeRem1
SEQ2_DIR=/san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=150000000
SEQ2_LAP=0

BASE=/cluster/data/cb2/bed/blastzCaeRem1.2005_09_09

SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << keep emacs coloring happy

    cp /cluster/data/cb2/chrom.sizes ./S1.len
    twoBitInfo /san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit S2.len

    #	establish a screen to control this job
    screen
    cd /cluster/data/cb2/bed/blastzCaeRem1.2005_09_09
    time /cluster/bin/scripts/doBlastzChainNet.pl -stop chainMerge \
	`pwd`/DEF > blast.run.out 2>&1 &
    #	STARTED 2005-08-15
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session:
    ssh kkstore02
    screen -d -r
    #	STARTED - 2005-09-09 15:27
XXXX
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=cat -stop=chainMerge \
	`pwd`/DEF > thruChainMerge.out 2>&1 &

    ##########  And one for the randomContigs
    mkdir /cluster/data/cb2/bed/blastzCaeRem1.2005_09_09/randomContigs
    cd /cluster/data/cb2/bed/blastzCaeRem1.2005_09_09/randomContigs

    #	utilizing small target chunks and a single query chunk so that
    #	the dynamic masking with parameter M can function.

    cat << '_EOF_' > DEF
# Cb2 vs caeRem1
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run.v7
BLASTZ=blastz.v7
BLASTZ_H=2000
BLASTZ_M=50
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: Cb2 - random contigs only
SEQ1_DIR=/san/sanvol1/scratch/worms/cb2/randomContigs/nib
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=500000
SEQ1_LAP=50

# QUERY: remanei caeRem1
SEQ2_DIR=/san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=150000000
SEQ2_LAP=0

BASE=/cluster/data/cb2/bed/blastzCaeRem1.2005_09_09/randomContigs

SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << happy emacs

    nibSize /san/sanvol1/scratch/worms/cb2/randomContigs/nib/*.nib | \
	awk '{printf "%s\t%s\n", $2, $3}' > S1.len
    twoBitInfo /san/sanvol1/scratch/worms/caeRem1/caeRem1.2bit S2.len
    #	establish a screen to control this job
    screen
    cd /cluster/data/cb2/bed/blastzCaeRem1.2005_09_09/randomContigs
    time /cluster/bin/scripts/doBlastzChainNet.pl -stop chainMerge \
	`pwd`/DEF > blast.run.out 2>&1 &
    #	STARTED 2005-09-09 15:40
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=cat -stop=chainMerge \
	`pwd`/DEF > thruChainMerge.out 2>&1 &
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=net \
	`pwd`/DEF > continueNet.out 2>&1 &

    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-continue=load -stop=load \
	`pwd`/DEF > load.out 2>&1 &
XXXX

    #	swap results to place cb2 alignments onto cb1
    ssh kkstore02
    cd /cluster/data/cb2/bed/blastzCb1.2005_05_02
    time /cluster/bin/scripts/doBlastzChainNet.pl -swap `pwd`/DEF > \
	swap.run.out 2>&1 &

#############################################################################
# BLAT SERVER SETUP (DONE - 2005-09-09 - Hiram)
    ssh hgwdev
    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES("cb2", "blat6", 17780, 1, 0);' \
	-h localhost hgcentraltest
    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES("cb2", "blat6", 17781, 0, 1);' \
	-h localhost hgcentraltest

#############################################################################
# BLASTZ Ce3 (WORKING - 2005-09-12 - Hiram)
    ssh kk
    mkdir /cluster/data/cb2/bed/blastzCe3.2005-09-12
    cd /cluster/data/cb2/bed
    ln -s blastzCe3.2005-09-12 blastz.ce3
    cd blastzCe3.2005-09-12

    #	Utilizing tiny target chunks and a single query chunk to use the
    #	dynamic masking available via the BLASTZ_M parameter
    cat << '_EOF_' > DEF
# cb2 vs ce3
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run.v7
BLASTZ=blastz.v7
BLASTZ_H=2000
BLASTZ_M=50
BLASTZ_ABRIDGE_REPEATS=0

# TARGET: briggsae Cb2
SEQ1_DIR=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit
SEQ1_2BIT=/san/sanvol1/scratch/worms/cb2/cb2.2bit
SEQ1_LIFT=/san/sanvol1/scratch/worms/cb2/liftRandoms.lft
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=1000000
SEQ1_LAP=100

# QUERY: elegans Ce3
SEQ2_DIR=/san/sanvol1/scratch/worms/ce3/nib
SEQ2_2BIT=/san/sanvol1/scratch/worms/ce3/ce3.2bit
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/cb2/bed/blastzCe3.2005-09-12

SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # happy emacs

    #	Build the two sequence length tables that DEF names as SEQ1_LEN
    #	and SEQ2_LEN.
    #	S1.len: sizes from chrRandomContigs.2bit, followed by the cb2.2bit
    #	size lines that match "random" and then those that match "chrUn".
    twoBitInfo /san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit S1.len
    twoBitInfo /san/sanvol1/scratch/worms/cb2/cb2.2bit stdout \
	| grep random >> S1.len
    twoBitInfo /san/sanvol1/scratch/worms/cb2/cb2.2bit stdout \
	| grep chrUn >> S1.len
    #	S2.len: columns 2 and 3 of the nibSize output, tab separated,
    #	sorted numerically descending on the second output column.
    #	The obsolete "+1" key form is taken as a file name by current
    #	sort implementations; "-k2,2" selects the same key and still
    #	inherits the global -r and -n options.
    nibSize /san/sanvol1/scratch/worms/ce3/nib/*.nib | \
	awk '{printf "%s\t%s\n", $2, $3}' | sort -rn -k2,2 > S2.len

XXXX - working on modifications to doBlastzChainNet.pl 2005-09-06
XXXX - to do this random contigs business properly

    #	establish a screen to control this job
    screen
    time ./doBlastzChainNet.pl -verbose=2 -stop=cat \
	-bigClusterHub=kk \
	`pwd`/DEF > stopCat.run.out 2>&1 &
    #	STARTED - 2005-09-09 - 16:18
    #	with lots of kluster contention
# Completed: 104 of 104 jobs
# CPU time in finished jobs:     125426s    2090.44m    34.84h    1.45d  0.004 y
# IO & Wait Time:                  1821s      30.35m     0.51h    0.02d  0.000 y
# Average job time:                1224s      20.39m     0.34h    0.01d
# Longest finished job:            2849s      47.48m     0.79h    0.03d
# Submission to last job:         18148s     302.47m     5.04h    0.21d
    time ./doBlastzChainNet.pl -continue cat -stop chainRun \
	-bigClusterHub=kk \
	`pwd`/DEF > toChainRun.run.out 2>&1 &
# Completed: 104 of 104 jobs
# CPU time in finished jobs:         31s       0.52m     0.01h    0.00d  0.000 y
# IO & Wait Time:                   351s       5.84m     0.10h    0.00d  0.000 y
# Average job time:                   4s       0.06m     0.00h    0.00d
# Longest finished job:              12s       0.20m     0.00h    0.00d
# Submission to last job:           211s       3.52m     0.06h    0.00d
    time ./doBlastzChainNet.pl -continue chainRun -stop chainMerge \
	-bigClusterHub=kk \
	`pwd`/DEF > toChainMerge.run.out 2>&1 &
#Completed: 7 of 7 jobs
#CPU time in finished jobs:        347s       5.78m     0.10h    0.00d  0.000 y
#IO & Wait Time:                    65s       1.08m     0.02h    0.00d  0.000 y
#Average job time:                  59s       0.98m     0.02h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             120s       2.00m     0.03h    0.00d
#Submission to last job:           696s      11.60m     0.19h    0.01d
    time ./doBlastzChainNet.pl -continue chainMerge -stop net \
	-bigClusterHub=kk \
	`pwd`/DEF > toNet.run.out 2>&1 &
    time ./doBlastzChainNet.pl -continue load -stop load \
	-bigClusterHub=kk \
	`pwd`/DEF > loadStep.run.out 2>&1 &
    time ./doBlastzChainNet.pl -continue download -stop cleanup \
	-bigClusterHub=kk \
	`pwd`/DEF > thruCleanup.run.out 2>&1 &
    #	swap results to place cb2 alignments onto ce3
    time ./doBlastzChainNet.pl -swap \
	-bigClusterHub=kk \
	`pwd`/DEF > \
	swap.run.out 2>&1 &
    #	detach from screen session: Ctrl-a Ctrl-d
    #	to reattach to this screen session: screen -r

#############################################################################
#  BLASTZ SELF (experiments to test doBlastzChainNet.pl and to test
#  dynamic masking M parameter)
    ssh pk
    mkdir /cluster/data/cb2/bed/blastzSelfM50
    cd /cluster/data/cb2/bed/blastzSelfM50

    #	Write the DEF parameter file for the cb2 vs cb2 self alignment
    #	with BLASTZ_M=50.  Target and query are both the cb2 (briggsae)
    #	sequence; SEQ2_SELF=1 marks the run as a self alignment.
    #	"export PATH=..." is kept on a single line: a bare "export" line
    #	followed by a separate PATH= line was a line-wrap artifact.
    cat << '_EOF_' > DEF
# cb2 vs cb2
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64

BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_M=50

# TARGET: briggsae Cb2
SEQ1_DIR=/san/sanvol1/scratch/worms/cb2/nib
SEQ1_CTGDIR=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit
SEQ1_LIFT=/san/sanvol1/scratch/worms/cb2/liftRandoms.lft
SEQ1_LEN=/san/sanvol1/scratch/worms/cb2/chrom.sizes
SEQ1_CTGLEN=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=500000
SEQ1_LAP=100

# QUERY: briggsae Cb2
SEQ2_DIR=/san/sanvol1/scratch/worms/cb2/nib
SEQ2_CTGDIR=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit
SEQ2_LIFT=/san/sanvol1/scratch/worms/cb2/liftRandoms.lft
SEQ2_LEN=/san/sanvol1/scratch/worms/cb2/chrom.sizes
SEQ2_CTGLEN=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.sizes
SEQ2_SELF=1
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=110000000
SEQ2_LAP=0

BASE=/cluster/data/cb2/bed/blastzSelfM50
TMPDIR=/scratch/tmp
'_EOF_'
    #	happy emacs

    cd /cluster/data/cb2/bed/blastzSelfM50

    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -stop=load \
	`pwd`/DEF > swapLoadReady.out 2>&1 &
# Completed: 231 of 231 jobs
# CPU time in finished jobs:     336879s    5614.65m    93.58h    3.90d  0.011 y
# IO & Wait Time:                 14146s     235.77m     3.93h    0.16d  0.000 y
# Average job time:                1520s      25.33m     0.42h    0.02d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:           29772s     496.20m     8.27h    0.34d
# Submission to last job:         38823s     647.05m    10.78h    0.45d

# Completed: 231 of 231 jobs
# CPU time in finished jobs:        171s       2.85m     0.05h    0.00d  0.000 y
# IO & Wait Time:                  1070s      17.83m     0.30h    0.01d  0.000 y
# Average job time:                   5s       0.09m     0.00h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:              11s       0.18m     0.00h    0.00d
# Submission to last job:            80s       1.33m     0.02h    0.00d

# Completed: 45 of 45 jobs
# CPU time in finished jobs:       4293s      71.54m     1.19h    0.05d  0.000 y
# IO & Wait Time:                   268s       4.47m     0.07h    0.00d  0.000 y
# Average job time:                 101s       1.69m     0.03h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1605s      26.75m     0.45h    0.02d
# Submission to last job:          1639s      27.32m     0.46h    0.02d

    # Crashed on one of the net steps, fix the script and continue:
    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -continue=net -stop=load \
	`pwd`/DEF > netToLoad.out 2>&1 &
    #	real    38m41.919s
    #	user    0m0.049s
    #	sys     0m0.041s

    #	Re-Load the tables under a special name:
    ssh hgwdev
    cd /cluster/data/cb2/bed/blastzSelfM50/axtChain
    hgLoadChain -tIndex cb2 M50chainSelf cb2.cb2.all.chain.gz
    #	Loading 5934642 chains into cb2.M50chainSelf
    netFilter -minGap=10 cb2.cb2.net \
	| hgLoadNet -verbose=0 cb2 M50netSelf stdin
    # real    234m14.863s
    # user    4m57.370s
    # sys     2m34.100s
    time featureBits cb2 M50chainSelfLink
    #	57461471 bases of 108418926 (52.999%) in intersection

#############################################################################
#  BLASTZ SELF (experiments to test doBlastzChainNet.pl and to test
#  dynamic masking M parameter)
#	Repeat experiment with M=1000
    ssh pk
    mkdir /cluster/data/cb2/bed/blastzSelfM1000
    cd /cluster/data/cb2/bed/blastzSelfM1000

    #	Write the DEF parameter file for the cb2 vs cb2 self alignment
    #	with BLASTZ_M=1000.  Target and query are both the cb2 (briggsae)
    #	sequence; SEQ2_SELF=1 marks the run as a self alignment.
    #	"export PATH=..." is kept on a single line: a bare "export" line
    #	followed by a separate PATH= line was a line-wrap artifact.
    cat << '_EOF_' > DEF
# cb2 vs cb2
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/x86_64

BLASTZ=blastz.v7.x86_64
BLASTZ_H=2000
BLASTZ_M=1000

# TARGET: briggsae Cb2
SEQ1_DIR=/san/sanvol1/scratch/worms/cb2/nib
SEQ1_CTGDIR=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit
SEQ1_LIFT=/san/sanvol1/scratch/worms/cb2/liftRandoms.lft
SEQ1_LEN=/san/sanvol1/scratch/worms/cb2/chrom.sizes
SEQ1_CTGLEN=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.sizes
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=500000
SEQ1_LAP=100

# QUERY: briggsae Cb2
SEQ2_DIR=/san/sanvol1/scratch/worms/cb2/nib
SEQ2_CTGDIR=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.2bit
SEQ2_LIFT=/san/sanvol1/scratch/worms/cb2/liftRandoms.lft
SEQ2_LEN=/san/sanvol1/scratch/worms/cb2/chrom.sizes
SEQ2_CTGLEN=/san/sanvol1/scratch/worms/cb2/chrRandomContigs.sizes
SEQ2_SELF=1
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=110000000
SEQ2_LAP=0

BASE=/cluster/data/cb2/bed/blastzSelfM1000
TMPDIR=/scratch/tmp
'_EOF_'
    #	happy emacs

    cd /cluster/data/cb2/bed/blastzSelfM1000

    time $HOME/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -stop=load \
	`pwd`/DEF > swapLoadReady.out 2>&1 &
    featureBits cb2 chainSelf
    #	108554651 bases of 108418926 (100.125%) in intersection

    #	Load the tables under a special name:
    #	Each *.chain file in axtChain/chain is loaded into its own table
    #	named <file root>_M1000chainSelf; the filtered net goes into the
    #	single table M1000netSelf.
    ssh hgwdev
cd /cluster/data/cb2/bed/blastzSelfM1000/axtChain/chain
# NOTE: foreach/set/end and the $f:r modifier are csh/tcsh syntax, unlike
# the sh-style loops used elsewhere in this file; run this loop in a
# csh-family shell.
foreach f (*.chain)
    # $f:r is the file name with its extension (.chain) removed
    set c = $f:r
    hgLoadChain cb2 ${c}_M1000chainSelf $f
end
cd /cluster/data/cb2/bed/blastzSelfM1000/axtChain

# Load nets:
# netFilter is run with -minGap=10 and its output is piped straight into
# hgLoadNet, which reads it from stdin.
netFilter -minGap=10 cb2.cb2.net \
| hgLoadNet -verbose=0 cb2 M1000netSelf stdin
    #	real    181m56.670s
    #	user    23m58.570s
    #	sys     6m56.920s

    #	featureBits runs out of memory on this table on hgwdev,
    #	so run it on kolossus instead
    ssh kolossus
    cd /cluster/data/cb2/bed/blastzSelfM1000
    time HGDB_CONF=~/.hg.conf.read-only featureBits cb2 \
	M1000chainSelfLink > fbCb2.M1000chainSelfLink 2>&1
    # 73156472 bases of 108418926 (67.476%) in intersection
    #	real    42m33.104s
    #	user    15m42.360s
    #	sys     5m35.500s


    #	The comparison numbers:

    #	pk kluster runs, x86_64 blastz binary
    #	Target chunk size of 500,000 overlap 100
    #	Query chunk size, whole genome = 110,000,000
    #	231 kluster jobs
    #
    #   BLASTZ_M parameter          M=50                  M=1000
    #	Average job time:             25 min                  85 min
    #   Hippos out to:               8.3 hr                  7.4 hr
    #   chainSelf table size   5,934,642 rows         27,659,493 rows
    #                            527,006 Kb            2,340,000 Kb
    #   chainSelfLink table   33,604,807 rows        183,315,214 rows
    #                              1,695 Mb                7,639 Mb
    #   netSelf table            274,211 rows            314,441 rows
    #                             28,967 Kb               33,345 Kb
    #   featureBits SelfLink  57,461,471 bases        73,156,472 bases
    #                           % 52.999 intersection   % 67.476 intersection

###########################################################################
#	Preparing downloads
    #	Build the goldenPath download files on the file server, then
    #	symlink them into the apache htdocs tree on hgwdev.
    ssh kkstore02
    mkdir /cluster/data/cb2/goldenPath
    #	bigZips/chromFa.tar.gz: one tarball of the chr*.fa files in softMask
    cd /cluster/data/cb2/softMask
    mkdir ../goldenPath/bigZips
    tar cvzf ../goldenPath/bigZips/chromFa.tar.gz ./chr*.fa

    #	Move into bigZips (already created above) before copying the
    #	RepeatMasker output.  From there ../../RMRun resolves to
    #	/cluster/data/cb2/RMRun and rmsk.out.gz lands beside
    #	chromFa.tar.gz.  A second mkdir at this point would only fail
    #	and leave the cp running from softMask.
    cd /cluster/data/cb2/goldenPath/bigZips
    cp -p ../../RMRun/rmsk.fa.out ./rmsk.out
    gzip rmsk.out
    #	chromosomes/: individually gzipped copies of the same chr*.fa files
    mkdir /cluster/data/cb2/goldenPath/chromosomes
    cd /cluster/data/cb2/goldenPath/chromosomes
    cp -p ../../softMask/chr*.fa .
    gzip chr*.fa

    #	Expose both directories under the web server document root
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/cb2/bigZips
    cd /usr/local/apache/htdocs/goldenPath/cb2/bigZips
    ln -s /cluster/data/cb2/goldenPath/bigZips/* .
    mkdir /usr/local/apache/htdocs/goldenPath/cb2/chromosomes
    cd /usr/local/apache/htdocs/goldenPath/cb2/chromosomes
    ln -s /cluster/data/cb2/goldenPath/chromosomes/chr*.fa.gz .

