# for emacs: -*- mode: sh; -*-


# This file describes browser build for the mouse
# genome, April 2007, ncbi mouse_37 - Mm9
#
#	"$Id: mm9.txt,v 1.138 2010/06/10 16:32:49 chinhli Exp $"
#

#######################################################################
# DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE - 2007-04-05 - Hiram)
#
#	Examine disk space issues, find some goodly amount of space
    ssh kkstore02
    mkdir /cluster/store5/mm9
    ln -s /cluster/store5/mm9 /cluster/data/mm9
    cd /cluster/data/mm9
    ## After testing with the pre-release below, the real thing begins here
    mkdir mouse_37
    cd mouse_37
    ## Ouch, the files are no longer delivered conveniently in a single
    ## directory.  They are in several locations now ...
    
NCBI=ftp://ftp.ncbi.nih.gov/genomes
MAPVIEW=MapView/Mus_musculus/sequence/BUILD.37.1/initial_release
for F in README README_CURRENT_BUILD
do
    wget  --dont-remove-listing --timestamping \
        "${NCBI}/M_musculus/${F}" -O ${F}
done
for F in allcontig.agp.gz seq_contig.md.gz ideogram.gz
do
    wget  --dont-remove-listing --timestamping \
        "${NCBI}/${MAPVIEW}/${F}" -O ${F}
done
    # survey the strains contained in seq_contig.md.gz
    zcat seq_contig.md.gz | awk '{print $9}' | sort | uniq -c | sort -rn
  13075 Celera
    360 C57BL/6J
    101 129/SvJ
     93 129/Sv
     79 unknown
     75 129/SvEvTac
     40 NOD
     26 129S7/SvEv
     14 129/Ola
      7 129
      6 Cast/Ei
      6 BALB/c
      3 SJL/J
      3 C3H
      3 B6/CBAF1J
      3 AKR/J
      3 A/J
      2 Spret/Ei
      1 group_label
      1 129/J
    # we will work on the C57BL/6J strain

mkdir -p chrAgp
cd chrAgp
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y
do
    wget  --dont-remove-listing --timestamping \
        "${NCBI}/M_musculus/Assembled_chromosomes/mm_ref_chr${C}.agp.gz" \
        -O chr${C}.agp.gz
done

cd ..
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y
do
    zcat chrAgp/chr${C}.agp.gz | grep "^c"
done > chrOnly.agp

mkdir -p chrfasta
cd chrfasta
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y
do
    wget  --dont-remove-listing --timestamping \
        "${NCBI}/M_musculus/Assembled_chromosomes/mm_ref_chr${C}.fa.gz" \
        -O chr${C}.fa.gz
done

cd ..
mkdir chrUn
mkdir chrM
# Fetch the unplaced (chrUn) and mitochondrial (chrMT) reference fasta.
# The mitochondrial file keeps its NCBI name, mm_ref_chrMT.fa.gz, which is
# the name the chrNamesFixed step later reads from the chrM/ directory.
wget  --dont-remove-listing --timestamping \
	"${NCBI}/M_musculus/CHR_Un/mm_ref_chrUn.fa.gz" -O chrUn/chrUn.fa.gz
wget  --dont-remove-listing --timestamping \
	"${NCBI}/M_musculus/CHR_MT/mm_ref_chrMT.fa.gz" \
	-O chrM/mm_ref_chrMT.fa.gz

mkdir contigFasta
# Fetch the per-chromosome contig fasta for every chromosome in one pass.
# The NCBI directory name is zero padded for chromosomes 1-9 (CHR_01 ...
# CHR_09) while the file name inside it is not.
for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y Un MT
do
    case "${C}" in
        [1-9]) DIR="CHR_0${C}" ;;
        *)     DIR="CHR_${C}" ;;
    esac
    wget  --dont-remove-listing --timestamping \
        "${NCBI}/M_musculus/${DIR}/mm_ref_chr${C}.fa.gz" \
        -O contigFasta/chr${C}.fa.gz
done
    mv contigFasta/chrMT.fa.gz contigFasta/chrM.fa.gz
    ## split up the contigFasta files into their individual contigs
    ## the sed fixes the fasta header name to just be the contig name
    mkdir splitContigs
# Split each per-chromosome contig fasta into one file per contig under
# splitContigs/<chrom>/.  The sed drops everything up to and including
# "ref|" and everything from the next "|" on, so each header line is
# reduced to ">" plus the contig name that faSplit byname uses.
for F in contigFasta/chr*.fa.gz
do
    BN=$(basename "${F}")
    # strip the .fa.gz suffix to get the chromosome name
    C=${BN%.fa.gz}
    echo "${F} ${BN} ${C}"
    echo -n "${C} working ... "
    mkdir -p "splitContigs/${C}"
    zcat "${F}" | sed -e "s/.*ref|/>/; s/|.*//" \
	| faSplit byname stdin "splitContigs/${C}/"
    echo "done"
done
    ## create agp files for the randoms from seq_contig.md and allcontig.agp
    ## both fragment and contig agp files
    $HOME/kent/src/hg/mouseStuff/buildTools/seqContigToAgp.pl \
	randomFragments.agp randomContigs.agp 2> randomContigs.err
    ## create contig agp file for non-randoms
    $HOME/kent/src/hg/mouseStuff/buildTools/mkContigAgp.pl allContigs.agp
    ## combine the two contig agp files
    cat allContigs.agp randomContigs.agp > mm9.contigs.agp
    ## separate the random contigs from the non-random contigs
    $HOME/kent/src/hg/mouseStuff/buildTools/sortRandoms.pl \
	randomContigs.agp > mvRandoms.sh
    ## inspect mvRandoms.sh and then run it if it is OK
    chmod +x mvRandoms.sh
    ./mvRandoms.sh
    ## verify all contigs exist properly
    $HOME/kent/src/hg/mouseStuff/buildTools/checkContigs.pl mm9.contigs.agp
    ## create all contigs fasta file
    cd splitContigs
    find . -type f | xargs cat > ../mm9.contigs.fa
    ## create assembled sequence from these contigs and agp file
    cd ..
    agpToFa -simpleMulti mm9.contigs.agp all mm9.assembled.fa mm9.contigs.fa
    ## create fragments agp file
    cat chrOnly.agp randomFragments.agp > mm9.fragments.agp
    ## verify this agp too will work with the assembled fasta
    ## need 2bit file to avoid fasta file ordering difficulty
    faToTwoBit mm9.assembled.fa mm9.assembled.2bit
    checkAgpAndFa mm9.fragments.agp mm9.assembled.2bit
    ## it has a problem with chrY because it is supposed to end with:
chrY    2902556 5902555 29      N       3000000 centromere      no
chrY    5902556 15902555        30      N       10000000        contig  no
    ## edit mm9.contigs.agp to add these two lines, and repeat the agpToFa
    ## after that, this check fails on chrX_random
    ## this is supposed to be a gap, with N's
    ## chrX_random     300319  303472  46      N       3154  fragment  yes
# Loop: chrX_random, dnaOffset=300318, seqSize=1785075
# agpFrag->chromStart: 300318, agpFrag->chromEnd: 303472, dnaOffset: 300318
# FASTA gap entry
# Bad char a found at index 300349
# Invalid Agp or Fasta file entry for sequence chrX_random
# agpMatchesFaEntry failed; exiting
    ## this comes from the use of a single fragment in two parts,
    ## from allcontig.agp
NT_165789.2     296206  300318  45      W       CAAA01187194.1  1       4113  +
NT_165789.2     300319  300349  46      N       31      fragment        no      
NT_165789.2     300350  303372  47      W       CAAA01187194.1  4145    7167
    ## which I processed into:
chrX_random     296206  300318  45      W       CAAA01187194.1  1       4113  +
chrX_random     300319  303472  46      N       3154    fragment        yes
    ## should have been
chrX_random     296206  300318  45      W       CAAA01187194.1  1       4113  +
chrX_random     300319  300349  46      N       31      fragment        yes
chrX_random     300350  303372  47      W       CAAA01187194.1  4145    7167  +
### NCBI had this as a non-bridged fragment, a 'no' - I'm making it a yes
    ## so, edit the randomFragments.agp to fixup that line as indicated
    ## the chrOnly.agp file also needs an entry for chrM, add this
    ## line to chrOnly.agp:
chrM    1       16299   1       F       NC_005089.1     1       16299   +
    ## now have successful business:
    checkAgpAndFa mm9.fragments.agp mm9.assembled.2bit
# All AGP and FASTA entries agree - both files are valid
    ## let's get the sequence in order in the fasta file
    faSplit byname mm9.assembled.fa splitChr/
    cut -f1 mm9.fragments.agp | uniq -c
    ## using the order of this fragments.agp file
    for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y M \
	13_random 16_random 17_random 1_random 3_random 4_random 5_random \
	7_random 8_random 9_random Un_random X_random Y_random
do
    cat splitChr/chr${C}.fa
done > mm9.fragorder.assembled.fa
    ## now that fasta file should also be OK
    checkAgpAndFa mm9.fragments.agp mm9.fragorder.assembled.fa
# All AGP and FASTA entries agree - both files are valid
    ## now ready to give this agp and fasta file off to makeGenomeDb.pl

    ## pre-release testing download sequence  ###############################
    mkdir ncbi
    cd ncbi
    cp -p /cluster/data/mm8/ncbi/.wgetrc .
    WGETRC=`pwd`/.wgetrc
    export WGETRC

    time nice -n +19 wget --timestamping --force-directories \
	--directory-prefix=. --dont-remove-listing --recursive \
	--level=4 --no-parent --no-host-directories --cut-dirs=1 \
	ftp://ftp-private.ncbi.nih.gov/mouse_37
    #	Downloaded: 2,599,733,765 bytes in 196 files

    #	The pre-release sequence, April 5th:
    mkdir /cluster/data/mm9/pre_release
    cd /cluster/data/mm9/pre_release
    #	The .wgetrc is the anonymous user
    cat << '_EOF_' > .wgetrc
login = anonymous
passwd = <your email address>
'_EOF_'
    # << happy emacs
    chmod 600 .wgetrc
    WGETRC=`pwd`/.wgetrc
    export WGETRC
    wget --timestamping --force-directories --directory-prefix=. \
	--dont-remove-listing --recursive --level=4 --no-parent \
	--no-host-directories --cut-dirs=3 \
	ftp://ftp.ncbi.nih.gov/genomes/M_musculus/pre_release
    ##	Ran a quick test build with that to see if it would work

    ### this procedure run for the pre_release and the mouse_37 sequence
    ### for pre_release the sed was:
    # zcat chrfasta/chr${C}.fa.gz | sed -e "s/^>lcl|/>/; s/.fa.*//"
    mkdir chrNamesFixed
    for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y
do
    zcat chrfasta/chr${C}.fa.gz | sed -e "s/^>gi.*/>chr${C}/" \
	| gzip -c > chrNamesFixed/chr${C}.fa.gz
    echo chr${C} done
done
    zcat chrM/mm_ref_chrMT.fa.gz | sed -e "s/^>gi.*/>chrM/" \
	| gzip -c > chrNamesFixed/chrM.fa.gz

    ## later on, an error was discovered in the processing of chrY_random
    # a lot of gaps of size zero were inserted.  They didn't cause any
    # disruption to the assembly track, they only caused extra gap entries
    # that were useless.  So, to fixup, remove anything in the
    # chrY_random_gap table that has a size of zero:
    hgsql -e 'delete from chrY_random_gap where size<"1";' mm9

    ## And, fixing the one fragment on chrX_random
    hgsql -e 'INSERT chrX_random_gap VALUES("587", "chrX_random",
"300318", "300349", "46", "N", "31", "fragment", "yes")' mm9

    # Remove the chrX_random_gold row(s) starting at 296205, then insert the
    # two pieces of CAAA01187194.1 (0-4113 and 4144-7167) that sit on either
    # side of the 300318-300349 gap.
    hgsql -e 'DELETE from chrX_random_gold where chromStart="296205";' mm9
    hgsql -e 'INSERT chrX_random_gold VALUES("587", "chrX_random",
"296205", "300318", "45", "W", "CAAA01187194.1", "0", "4113", "+")' mm9
    # NOTE(review): the corrected AGP lines earlier in this file number this
    # second piece 47, but the fifth value inserted below is "45" - confirm.
    hgsql -e 'INSERT chrX_random_gold VALUES("587", "chrX_random",
"300349", "303372", "45", "W", "CAAA01187194.1", "4144", "7167", "+")' mm9


##########################################################################
## final makeGenomeDb.pl (DONE - 2007-07-19 - Hiram)
    ## to make this go again, some things need to be removed or set-aside

    ssh hgwdev
    hgsql -e 'delete from dbDb where name="mm9";' hgcentraltest
    rm -fr /gbdb/mm9

    ssh kkstore06
    cd /cluster/data/mm9
    mv mm9.config.ra mm9.config.pre_release.ra
    mv bed bed.pre_release
    mv mm9.unmasked.2bit mm9.unmasked.2bit.pre_release
    mv mm9.agp mm9.agp.pre_release
    mv mm9.randoms.2bit mm9.randoms.2bit.pre_release
    mv mm9.rmsk.2bit mm9.rmsk.2bit.pre_release
    mv mm9.rmskTrf.2bit mm9.rmskTrf.2bit.pre_release
    rm mm9.2bit
    rm -fr ? ??
    mv dbDbInsert.sql dbDbInsert.sql.pre_release
    mv makeGenomeDb.out makeGenomeDb.out.pre_release
    mv chrom.lst chrom.lst.pre_release
    mv jkStuff jkStuff.pre_release
    ## ask cluster-admin to rename the existing mm9 db to be mm9prerelease

    cat << '_EOF_' > mm9.config.ra
# Config parameters for makeGenomeDb.pl:
db mm9
scientificName Mus musculus
commonName Mouse
assemblyDate Jul. 2007
assemblyLabel NCBI Build 37
orderKey 121
mitoAcc none
fastaFiles /cluster/data/mm9/mouse_37/mm9.fragorder.assembled.fa
agpFiles /cluster/data/mm9/mouse_37/mm9.fragments.agp
# qualFiles /dev/null
dbDbSpeciesDir mouse
'_EOF_'
    # << happy emacs
    time nice -n +19 makeGenomeDb.pl mm9.config.ra > makeGenomeDb.out 2>&1 &
    #	real    24m24.468s

    ssh hgwdev
    featureBits mm9 gold
# 2620346158 bases of 2620346158 (100.000%) in intersection
    featureBits mm8 gold
# 2567283971 bases of 2567283971 (100.000%) in intersection
    featureBits mm9 gap
# 105419323 bases of 2620346158 (4.023%) in intersection
    featureBits mm8 gap
# 97171117 bases of 2567283971 (3.785%) in intersection

    #	verify index is correct:
    hgsql mm9 -e "show index from gc5Base;"
    #	should see good numbers in Cardinality column

    #	Reset default position to be like Mm8
    hgsql -e \
'update dbDb set defaultPos="chr12:50258170-50263946" where name="mm9";' \
	hgcentraltest

    # create initial symlink for 2bit sequence
    mkdir /gbdb/mm9
    mkdir /gbdb/mm9/html
    ln -s /cluster/data/mm9/mm9.unmasked.2bit /gbdb/mm9/mm9.2bit

    ## enter the trackDb business (was done in the pre-release test)

##########################################################################
## Initial pre-release makeGenomeDb.pl (DONE - 2007-04-05 - Hiram)
    ssh kkstore02
    cd /cluster/data/mm9
    cat << '_EOF_' > mm9.config.ra
# Config parameters for makeGenomeDb.pl:
db mm9
scientificName Mus musculus
commonName Mouse
assemblyDate Apr. 2007
assemblyLabel NCBI Build 37
orderKey 121
mitoAcc 33115104
fastaFiles /cluster/data/mm9/pre_release/chrNamesFixed/chr*.fa.gz
agpFiles /cluster/data/mm9/pre_release/chrOnly.agp
# qualFiles /dev/null
dbDbSpeciesDir mouse
'_EOF_'
    # << happy emacs
    time nice -n +19 makeGenomeDb.pl mm9.config.ra > makeGenomeDb.out 2>&1 &
    #	real    24m24.468s

##########################################################################
## Repeat masker (DONE - 2007-04-05 - Hiram)
## 	RE-DONE with final sequence 2007-07-19 - Hiram
    ssh kkstore06
    ## use screen for this
    mkdir /cluster/data/mm9/bed/RepeatMasker
    cd /cluster/data/mm9/bed/RepeatMasker
    time nice -n +19 doRepeatMasker.pl -bigClusterHub=kk \
	-buildDir=/cluster/data/mm9/bed/RepeatMasker mm9 > do.out 2>&1 &
    #	real    1726m32.849s
# Completed: 5467 of 5467 jobs
# CPU time in finished jobs:   54774630s  912910.50m 15215.17h  633.97d  1.737 y
# IO & Wait Time:                432302s    7205.04m   120.08h    5.00d  0.014 y
# Average job time:               10098s     168.30m     2.81h    0.12d
# Longest finished job:           20982s     349.70m     5.83h    0.24d
# Submission to last job:        100294s    1671.57m    27.86h    1.16d

    ssh kkstore06
    cd /cluster/data/mm9
    twoBitToFa mm9.rmsk.2bit stdout | faSize stdin
# 2725765481 bases (105419509 N's 2620345972 real 1466644650 upper
#	1153701322 lower) in 35 sequences in 1 files
# %42.33 masked total, %44.03 masked real

##############################################################################
## simpleRepeat masking (DONE - 2007-04-07 - Hiram)
##	RE-DONE with final sequence 2007-07-19 - Hiram
    ssh kolossus
    ## use screen for this
    mkdir /cluster/data/mm9/bed/simpleRepeat
    cd /cluster/data/mm9/bed/simpleRepeat
    time nice -n +19 twoBitToFa ../../mm9.unmasked.2bit stdout \
	| trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
		-bedAt=simpleRepeat.bed -tempDir=/scratch/tmp
    #	real    253m44.602s
    #	Appears to have an error on something:
# sh: line 1: 18346 File size limit exceeded/cluster/bin/i386/trf /scratch/tmp/stdin_kolossus_3af1_fe9700.tf 2 7 7 80 10 50 2000 -m -d
# Expecting 14 words line 4593 of /scratch/tmp/stdin_kolossus_3af1_fe9700.tf.2.7.7.80.10.50.2000.dat got 1

    #	Let's try running this on the kki kluster, by chrom
    ssh kkr1u00
    mkdir /iscratch/i/mus/mm9
    cd /iscratch/i/mus/mm9
    cp -p /cluster/data/mm9/mm9.unmasked.2bit .
    cp -p /cluster/data/mm9/chrom.sizes .
    # Write one gzipped fasta (extracted with -noMask) for each sequence
    # named in the first column of chrom.sizes.  read -r keeps the names
    # literal and the expansions are quoted so no name is word-split.
    cut -f1 chrom.sizes | while IFS= read -r C
do
  twoBitToFa -noMask -seq="${C}" mm9.unmasked.2bit stdout | gzip -c > "${C}.fa.gz"
  echo "${C}"
done

    for R in 2 3 4 5 6 7 8
do
    rsync -a --progress /iscratch/i/mus/mm9/ kkr${R}u00:/iscratch/i/mus/mm9/
done

    ssh kki
    mkdir /cluster/data/mm9/bed/simpleRepeat/trf
    cd /cluster/data/mm9/bed/simpleRepeat/trf

    cat << '_EOF_' > runTrf
#!/bin/csh -fe 
#
set C = $1
set GZ = /iscratch/i/mus/mm9/$C.fa.gz
mkdir -p /scratch/tmp/$C
zcat $GZ > /scratch/tmp/$C/$C.fa
pushd /scratch/tmp/$C
/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $C.fa \
	/dev/null -bedAt=$C.bed -tempDir=/scratch/tmp/$C
popd
rm -f $C.bed
cp -p /scratch/tmp/$C/$C.bed .
rm -fr /scratch/tmp/$C
'_EOF_'
    # << happy emacs
    chmod +x runTrf

    cat << '_EOF_' > template
#LOOP
./runTrf $(path1) {check out line $(root1).bed}
#ENDLOOP
'_EOF_'
    # << happy emacs

    cut -f1 /iscratch/i/mus/mm9/chrom.sizes > chrom.lst
    gensub2 chrom.lst single template jobList
    para create jobList
    para try ... check ... push ... etc ...
    ## none of these jobs had any trouble; running line counts of these result
    ## bed files against the previous failed run indicates they are identical
# Completed: 35 of 35 jobs
# CPU time in finished jobs:      14620s     243.66m     4.06h    0.17d  0.000 y
# IO & Wait Time:                   272s       4.54m     0.08h    0.00d  0.000 y
# Average job time:                 425s       7.09m     0.12h    0.00d
# Longest finished job:            1386s      23.10m     0.39h    0.02d
# Submission to last job:          1790s      29.83m     0.50h    0.02d

    cat *.bed > ../simpleRepeat.bed
    cd ..
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

    ssh hgwdev
    cd /cluster/data/mm9/bed/simpleRepeat
    time nice -n +19 hgLoadBed mm9 simpleRepeat \
      simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    #	Loaded 1167619 elements of size 16
    #	real    0m33.312s

    nice -n +19 featureBits mm9 simpleRepeat
    #	80054947 bases of 2620346158 (3.055%) in intersection

    ## clean up the /iscratch/i/mus/mm9/ directory
    ## for downloads:
    mkdir trfMaskChrom
    cd trfMaskChrom
    ln -s ../trf/chr*.bed .

###########################################################################
# CREATE MICROSAT TRACK (DONE - 2007-07-20 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/mm9/bed/microsat
    cd /cluster/data/mm9/bed/microsat
    awk '($5==2 || $5==3) && $6 >= 15 && $8 == 100 && $9 == 0 {printf("%s\t%s\t%s\t%dx%s\n", $1, $2, $3, $6, $16);}' \
	../simpleRepeat/simpleRepeat.bed > microsat.bed 
    hgLoadBed mm9 microsat microsat.bed
    #	Loaded 195688 elements of size 4

    featureBits mm9 microsat
# 8713212 bases of 2620346158 (0.333%) in intersection
    featureBits mm8 microsat
# 8570611 bases of 2567283971 (0.334%) in intersection

#############################################################################
# PROCESS SIMPLE REPEATS INTO MASK (DONE - 2007-07-21 - Hiram)

    # After the simpleRepeats track has been built, make a filtered version
    # of the trf output: keep trf's with period <= 12:
    ssh kkstore06
    cd /cluster/data/mm9/bed/simpleRepeat
    mkdir trfMask
    # For each per-chrom trf bed file, keep only the rows whose fifth
    # column is <= 12 and write them to the same file name under trfMask/
    for F in trf/chr*.bed
    do
	NAME=${F#trf/}
	echo "${F} -> ${NAME}"
	awk '$5 <= 12' ${F} > trfMask/${NAME}
    done

    ## Add trfMask to repeat masked sequence
    ssh kkstore06
    cd /cluster/data/mm9
    cat  << '_EOF_' > addTrf.csh
#!/bin/csh -efx
# This script will fail if any of its commands fail.

set DB = mm9
set WORK_DIR = /cluster/data/${DB}
cd ${WORK_DIR}
set inputTwoBit = ${WORK_DIR}/${DB}.rmsk.2bit
set outputTwoBit = ${WORK_DIR}/${DB}.rmskTrf.2bit
cat /cluster/data/${DB}/bed/simpleRepeat/trfMask.bed \
        | twoBitMask -add -type=.bed ${inputTwoBit} stdin ${outputTwoBit}
twoBitToFa ${outputTwoBit} stdout | faSize stdin > faSize.${DB}.rmskTrf.txt
'_EOF_'
    # << happy emacs
    chmod +x ./addTrf.csh
    time ./addTrf.csh
    cat faSize.mm9.rmskTrf.txt
# 2725765481 bases (105419509 N's 2620345972 real 1465037892 upper
#	1155308080 lower) in 35 sequences in 1 files
# %42.38 masked total, %44.09 masked real

    ln -s mm9.rmskTrf.2bit mm9.2bit
    # fixup /gbdb/mm9/mm9.2bit symlink to this newly masked sequence

    ## copy to san for genbank kluster run
    cd /cluster/data/mm9
    cp -p mm9.rmskTrf.2bit /san/sanvol1/scratch/mm9/mm9.2bit

############################################################################
#  BLATSERVERS ENTRY (DONE - 2007-04-09 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("mm9", "blat14", "17790", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("mm9", "blat14", "17791", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

########################################################################
##  CYTOBAND - ideogram track (DONE - 2007-08-15 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/mm9/bed/cytoBand
    cd /cluster/data/mm9/bed/cytoBand

    # Create bed file
    # (this script fixed up to eliminate one of the lines from ideogram file)
    $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl ../../mouse_37/ideogram
    ### doesn't work, the ideogram file is corrupted, use the one fetched below
    ## as so:
    $HOME/kent/src/utils/ncbi/createNcbiCytoBand.pl ideogram
    ## can now verify before load:
    $HOME/kent/src/utils/ncbi/cytoBandVerify.pl
    #	everything checks out OK on 21 chroms
    # Load the bed file
    hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/cytoBand.sql \
	mm9 cytoBand cytoBand.bed
    # Make cytoBandIdeo track for ideogram gif on hgTracks page.
    # For mouse cytoBandIdeo is just a replicate of the cytoBand track.
    hgsql -e "drop table cytoBandIdeo;" mm9
    hgsql mm9 -e "create table cytoBandIdeo (index(chrom(10),chromStart)) as select * from cytoBand;"

    ## fetch updated ideogram.gz file that has been fixed by NCBI
    NCBI=ftp://ftp.ncbi.nih.gov/genomes
    MAPVIEW=MapView/Mus_musculus/sequence/BUILD.37.1/updates
    wget  --dont-remove-listing --timestamping \
        "${NCBI}/${MAPVIEW}/ideogram.gz" -O ideogram.gz
    ## run through the createNcbiCytoBand.pl process above, and then load
    ## can now verify before load:
    $HOME/kent/src/utils/ncbi/cytoBandVerify.pl
    #	everything checks out OK on 21 chroms

##########################################################################
## GENBANK alignments (DONE - 2007-08-03 - Hiram)
    ## next time:  don't forget to make the 11.ooc file, see below
    ## generate a lift file that specifies segments separated by non-bridged
    ## gaps
    ## make the ooc file
    ssh kolossus
    cd /cluster/data/mm9
    time blat mm9.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=11.ooc -repMatch=912
    #	real    2m29.455s
    cp -p 11.ooc /san/sanvol1/scratch/mm9
    cp -p 11.ooc jkStuff
    ## also setup /iscratch/i/mus/mm9/ with these files for
    ## other kluster runs:
    #	-rw-rw-r--  1 712923274 Jul 21 13:31 mm9.2bit
    #	-rw-rw-r--  1     17179 Jul 23 16:18 nonBridgedGap.lft
    #	-rw-rw-r--  1    122352 Jul 24 11:32 11.ooc

    ssh hgwdev
    cd /cluster/data/mm9/jkStuff
    gapToLift mm9 nonBridgedGap.lft
# WARNING: gap at end of chromosome at chrY:5902555-15902555
# WARNING: overlapping gap at chrY:2902555-5902555 and chrY:5902555-15902555
    ## These warnings are true, chrY has two gaps next to each other, and
    ## the second one is actually the end of the chrom.  This is the way the
    ## NCBI supplied AGP file is.  (this seems to be normal in hg18 too ...)
    cp -p nonBridgedGap.lft /san/sanvol1/scratch/mm9
    cd ..
    cp -p mm9.rmskTrf.2bit /san/sanvol1/scratch/mm9/mm9.2bit
    ## The genbank.conf entry looks like:
# mm9
mm9.serverGenome = /cluster/data/mm9/mm9.2bit
mm9.clusterGenome = /san/sanvol1/scratch/mm9/mm9.2bit
mm9.ooc = /cluster/data/mm9/11.ooc
mm9.align.unplacedChroms = *
mm9.lift = /cluster/data/mm9/jkStuff/nonBridgedGap.lft
mm9.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
mm9.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
mm9.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
mm9.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
mm9.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
mm9.downloadDir = mm9
mm9.refseq.mrna.xeno.load  = yes
mm9.refseq.mrna.xeno.loadDesc = yes
mm9.mgcTables.default = full
mm9.mgcTables.mgc = all

    ssh kkstore02
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial mm9 &
    ##	var/build/logs/2007.07.26-21:57:22.mm9.initalign.log

    ## logFile: var/build/logs/2007.07.23-16:44:31.mm9.initalign.log
    #	real    771m12.978s
    #  a couple of failed jobs, finish off the align step manually
    ssh kk
    cd /cluster/bluearc/genbank/work/initial.mm9/align
    para time
# Completed: 50580 of 50580 jobs
# CPU time in finished jobs:   14556484s  242608.06m  4043.47h  168.48d  0.462 y
# IO & Wait Time:                988518s   16475.30m   274.59h   11.44d  0.031 y
# Average job time:                 307s       5.12m     0.09h    0.00d
# Longest finished job:            1815s      30.25m     0.50h    0.02d
# Submission to last job:         40513s     675.22m    11.25h    0.47d

    ## after recovery of the alignments jobs
    ssh kkstore02
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -continue=finish -initial mm9 &
    #	var/build/logs/2007.07.27-11:02:00.mm9.initalign.log
    #	real    169m53.124s

    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad mm9
    #	var/dbload/hgwdev/logs/2007.07.27-14:10:22.dbload.log
    #	real    54m55.707s

    ## the two measurements are for two different runs of genbank,
    ## once configured as "ordered" secondly configured as "finished"
    featureBits mm9 refGene:cds
    #	30105171 bases of 2620346127 (1.149%) in intersection
    #	30113840 bases of 2620346127 (1.149%) in intersection
    featureBits mm9 refGene
    #	51164928 bases of 2620346127 (1.953%) in intersection
    #	51175624 bases of 2620346127 (1.953%) in intersection
    featureBits mm9 mrna
    #	135379415 bases of 2620346127 (5.166%) in intersection
    #	137195240 bases of 2620346127 (5.236%) in intersection
    featureBits mm9 mgcGenes
    #	33676155 bases of 2620346127 (1.285%) in intersection
    #	34012201 bases of 2620346127 (1.298%) in intersection
    featureBits mm9 est
    #	184121510 bases of 2620346127 (7.027%) in intersection
    #	188799620 bases of 2620346127 (7.205%) in intersection
    featureBits mm9 intronEst
    #	52305179 bases of 2620346127 (1.996%) in intersection
    #	52812173 bases of 2620346127 (2.015%) in intersection
    featureBits mm9 xenoMrna
    #	46119254 bases of 2620346127 (1.760%) in intersection
    #	51438566 bases of 2620346127 (1.963%) in intersection
    featureBits mm9 xenoRefGene
    #	40378885 bases of 2620346127 (1.541%) in intersection
    #	44298281 bases of 2620346127 (1.691%) in intersection

    # enable daily alignment and update of hgwdev (DONE - 2007-08-03 - Hiram)
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add mm9 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added mm9 - Mus musculus" etc/align.dbs etc/hgwdev.dbs
    make etc-update

#########################################################################
# MAP CONTIGS TRACK (DONE - 2007-07-23 - Hiram)
    ## can take contig information directly from previously created
    ## mm9.contigs.agp
    ssh hgwdev
    mkdir /cluster/data/mm9/bed/ctgPos
    cd /cluster/data/mm9/bed/ctgPos
    grep CONTIG ../../mouse_37/mm9.contigs.agp \
	| awk '{printf "%s\t%d\t%s\t%d\t%d\n", $6, $8, $1, $2-1, $3}' \
	> mm9.ctgPos.tab

    hgsql mm9 < ~/kent/src/hg/lib/ctgPos.sql
    hgsql mm9 -e 'load data local infile "mm9.ctgPos.tab" into table ctgPos;'

    featureBits -countGaps mm9 ctgPos
    #	2623952781 bases of 2725765481 (96.265%) in intersection
    featureBits -countGaps mm8 ctgPos
    #	2573322222 bases of 2664455088 (96.580%) in intersection

#########################################################################
## Create downloads directory (DONE - 2007-07-25 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/mm9/bed/simpleRepeat/trfMaskChrom
    cd /cluster/data/mm9/bed/simpleRepeat/trfMaskChrom
    ln -s ../trf/chr*.bed .

    cd /cluster/data/mm9
    time nice -n +19 /cluster/bin/scripts/makeDownloads.pl mm9 \
	> do.downloads.out 2>&1
    #	real    41m18.282s
    ## failed during jkStuff/doInstall.csh:
# foreach size ( 1000 2000 5000 )
# echo 1000
# featureBits mm9 refGene:upstream:1000 -fa=stdout
# setpriority: Permission denied.
# Error writing 50 bytes: Operation not permitted
    ## remove the "nice" statements from the csh, and finish it off
    ## edit the README files to indicate correct information


##########################################################################
# MGI LIFTOVER FROM MM8 (DONE 2007-07-26 angie)
    ssh kolossus
    mkdir /cluster/data/mm9/bed/jaxLiftOver
    cd /cluster/data/mm9/bed/jaxLiftOver
    ldHgGene -out=stdout -nobin placeholder placeholder \
      /cluster/data/mm8/bed/jax/2007_07/jaxRepTranscript.gff \
    | liftOver stdin -minBlocks=0.5 \
      /cluster/data/mm8/bed/liftOver/mm8ToMm9.over.chain.gz \
      -genePred jaxRepTranscriptLift.{gp,unmapped}
#Read 31587 transcripts in 232925 lines in 1 files
    wc -l jaxRepTranscriptLift.{gp,unmapped}
#  31470 jaxRepTranscriptLift.gp
#    234 jaxRepTranscriptLift.unmapped
    liftOver -minBlocks=0.5 /cluster/data/mm8/bed/jax/2007_07/jaxAllele.bed \
      /cluster/data/mm8/bed/liftOver/mm8ToMm9.over.chain.gz \
      -bedPlus=12 jaxAlleleLift.{bed,unmapped}
    wc -l jaxAlleleLift.{bed,unmapped}
#  12372 jaxAlleleLift.bed
#      2 jaxAlleleLift.unmapped
    liftOver -minBlocks=0.5 /cluster/data/mm8/bed/jax/2007_07/jaxPhenotype.bed \
      /cluster/data/mm8/bed/liftOver/mm8ToMm9.over.chain.gz \
      -bedPlus=12 -tab jaxPhenotypeLift.{bed,unmapped}
    wc -l jaxPhenotypeLift.{bed,unmapped}
#  23806 jaxPhenotypeLift.bed
#      0 jaxPhenotypeLift.unmapped
    liftOver -minBlocks=0.5 /cluster/data/mm8/bed/jax/2007_07/jaxQtl.bed \
      /cluster/data/mm8/bed/liftOver/mm8ToMm9.over.chain.gz \
      -bedPlus=6 -tab jaxQtlLift.{bed,unmapped}
    wc -l jaxQtlLift.{bed,unmapped}
#  1539 jaxQtlLift.bed
#    12 jaxQtlLift.unmapped

    # Load lifted track tables and original auxiliary tables:
    ssh hgwdev
    # the lifted .gp/.bed files were written to the mm9 jaxLiftOver
    # directory created at the start of this section, so load from there
    cd /cluster/data/mm9/bed/jaxLiftOver
    # jaxRepTranscriptLift
    ldHgGene -predTab mm9 jaxRepTranscriptLift jaxRepTranscriptLift.gp
#31470 gene predictions
    sed -e 's/jaxRepTranscript/jaxRepTranscriptLift/g' \
      /cluster/data/mm8/bed/jax/2007_07/fixJaxRepTranscript.sql \
      > fixJaxRepTranscriptLift.sql
    hgsql mm9 < fixJaxRepTranscriptLift.sql
    hgLoadSqlTab mm9 jaxRepTranscriptAlias \
      /cluster/data/mm8/bed/jax/2007_07/jaxRepTranscriptAlias.sql \
      /cluster/data/mm8/bed/jax/2007_07/jaxRepTranscriptAlias.tab
    hgsql mm9 -e 'rename table jaxRepTranscriptAlias to jaxRepTranscriptLiftAlias;'
    # jaxAlleleLift
    sed -e 's/bed12Source/jaxAlleleLift/g' \
      $HOME/kent/src/hg/lib/bed12Source.sql > jaxAlleleLift.sql
    hgLoadBed -sqlTable=jaxAlleleLift.sql mm9 jaxAlleleLift jaxAlleleLift.bed
#Loaded 12372 elements of size 13
    sed -e 's/jaxAllele/jaxAlleleLift/g' \
      /cluster/data/mm8/bed/jax/2007_07/fixJaxAllele.sql > fixJaxAlleleLift.sql
    # empty file, but just in case it has something in the future...
    hgsql mm9 < fixJaxAlleleLift.sql
    hgLoadSqlTab mm9 jaxAlleleInfo \
      ~/kent/src/hg/lib/jaxAlleleInfo.sql \
      /cluster/data/mm8/bed/jax/2007_07/jaxAlleleInfo.tab
    # jaxPhenotypeLift
    sed -e 's/bed12Source/jaxPhenotypeLift/g' \
      ~/kent/src/hg/lib/bed12Source.sql > jaxPhenotypeLift.sql
    hgLoadBed -tab -sqlTable=jaxPhenotypeLift.sql mm9 jaxPhenotypeLift \
      jaxPhenotypeLift.bed
#Loaded 23806 elements of size 13
    sed -e 's/jaxPhenotype/jaxPhenotypeLift/g' \
      /cluster/data/mm8/bed/jax/2007_07/fixJaxPhenotype.sql \
      > fixJaxPhenotypeLift.sql
    # empty file, but just in case it has something in the future...
    hgsql mm9 < fixJaxPhenotypeLift.sql
    hgLoadSqlTab mm9 jaxPhenotypeAlias \
      /cluster/data/mm8/bed/jax/2007_07/jaxPhenotypeAlias.sql \
      /cluster/data/mm8/bed/jax/2007_07/jaxPhenotypeAlias.tab
    hgsql mm9 -e 'rename table jaxPhenotypeAlias to jaxPhenotypeLiftAlias;'
    # phenotype-allele relationships
    hgLoadSqlTab mm9 jaxAllelePheno \
      ~/kent/src/hg/lib/jaxAllelePheno.sql \
      /cluster/data/mm8/bed/jax/2007_07/jaxAllelePheno.tab
    # jaxQTLLift
    sed -e 's/jaxQTL/jaxQTLLift/g'\
      ~/kent/src/hg/lib/jaxQTL.sql  > jaxQTLLift.sql
    hgLoadBed -tab -notItemRgb -noBin \
      -sqlTable=jaxQTLLift.sql \
      mm9 jaxQTLLift jaxQtlLift.bed
#Loaded 1539 elements of size 10
    # Add row to mm9.grp for Phenotype and Allele track group:
    hgsql mm9 -e 'insert into grp values("phenoAllele", "Phenotype and Allele", 4.5);'


##########################################################################
## Creating pushQ (DONE - 2007-07-26 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/mm9/pushQ
    cd /cluster/data/mm9/pushQ
    /cluster/bin/scripts/makePushQSql.pl mm9 > mm9.sql 2> stderr.out
    ## check the stderr.out for anything that needs to be fixed
    ## copy mm9.sql to hgwbeta:/tmp
    scp mm9.sql hgwbeta:/tmp
    ## then on hgwbeta
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < mm9.sql

#############################################################################
# STS MARKERS DATA DOWNLOAD (DONE - 2007-07-26 - Hiram)
    ssh kkstore06
    mkdir -p /cluster/data/mm9/bed/STSmarkers/downloads
    cd /cluster/data/mm9/bed/STSmarkers/downloads
    # these files appear to be new almost every day
    time nice -n +19 wget --timestamping \
	ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_mouse.sts
    time nice -n +19 wget --timestamping \
	ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases

    #	The new feature in the .aliases file this time is names with
    #	spaces in them!  This changes our parsing business below;
    #	hopefully the spaces in the names won't cause trouble elsewhere.

    time nice -n +19 wget --timestamping \
ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Mus_musculus/*

    # these reports from jax.org appear to be changing daily
    time nice -n +19 wget --timestamping \
	ftp://ftp.informatics.jax.org/pub/reports/MRK_Dump2.rpt
    time nice -n +19 wget --timestamping \
	ftp://ftp.informatics.jax.org/pub/reports/MRK_Sequence.rpt
    time nice -n +19 wget --timestamping \
	ftp://ftp.informatics.jax.org/pub/reports/PRB_PrimerSeq.rpt
    ls -ogrt
#	-rw-rw-r--  1      676 Mar 11  2004 README
#	-rw-rw-r--  1   396858 Jan 28  2005 10090.MGI.txt
#	-rw-rw-r--  1   390139 Mar 16  2005 10090.WI_MRC_RH.txt
#	-rw-rw-r--  1   240688 Mar 16  2005 10090.WI-YAC.txt
#	-rw-rw-r--  1   173344 Mar 16  2005 10090.WI-Genetic.txt
#	-rw-rw-r--  1 25691253 Jan 13  2006 UniSTS.aliases
#	-rw-rw-r--  1  4582158 Jul  5 11:40 UniSTS_mouse.sts
#	-rw-rw-r--  1  2841773 Jul 26 03:13 PRB_PrimerSeq.rpt
#	-rw-rw-r--  1  5149790 Jul 26 03:13 MRK_Sequence.rpt
#	-rw-rw-r--  1  5697140 Jul 26 03:13 MRK_Dump2.rpt

    #	 I note the UniSTS.aliases file is over twice as big as was in
    #	 Mm7 build.  I wonder what got into it ...
    #	What got into it was that it was completely broken.  It appeared
    #	to have a vast section of itself duplicated again in the file.
    #	It was cleaned up via:
    echo -e "#Unique ID\tAliases" > uniqueSTS.aliases
    grep -v "^#" UniSTS.aliases | sort -n | uniq >> uniqueSTS.aliases
    mv UniSTS.aliases UniSTS.aliases.broken
    mv uniqueSTS.aliases UniSTS.aliases

    # back to our work area, update the bed file
    #	to do this we need a new UniSTS_mouse.alias file
    # it is created by a combination of information from several
    # of the above files ! AND ! the previous stsInfoMouse.bed file
    # the db reference here is to the previous build
    time nice -n +19 ~/kent/src/hg/stsMarkers/fetchAllAliases.csh mm8

    #	Here is a normal set of errors:
# processing UniSTS_mouse.sts to find aliases
# #       ERROR: KNOWN(==OK) duplicate ID: '108991' encountered at line
# #       2384
# processing MGI.aliases
# fetching existing aliases from previous stsInfoMouse.bed file
# found 27648 potential errors in
#	/cluster/data/mm8/bed/STSmarkers/stsInfoMouse.bed
# to see the errors: grep ERROR stsInfoAliases.txt
# verify those stsInfoMouse.bed aliases with UniSTS.aliases

    #	those errors in the previous stsInfoMouse.bed file are an
    #	accumulation of errors from a long long time ago in this chain
    #	of processing.  Some day it might be nice to fix them, but they
    #	don't seem to bother anything, so they continue to be carried
    #	forward, and a couple of new ones are added with each assembly.

####################################################################
##  STS markers data processing track (DONE - 2007-07-26 - Hiram)
    ssh hgwdev
    cd /cluster/data/mm9/bed/STSmarkers
    # create a new stsInfoMouse.bed file:
    #	Update the m m 8 directory name here to m m 9
    #	for the next build of m m 10,  ...etc... and so forth
    time ~/kent/src/hg/stsMarkers/updateBed.pl \
        /cluster/data/mm8/bed/STSmarkers/stsInfoMouse.bed \
        downloads/MRK_Dump2.rpt \
	downloads/PRB_PrimerSeq.rpt \
        downloads/MRK_Sequence.rpt \
	downloads/UniSTS_mouse.alias \
        downloads/UniSTS_mouse.sts \
        -g downloads/10090.WI-Genetic.txt \
        -r downloads/10090.WI_MRC_RH.txt \
        -verbose 2> dbg.updateBed | sed -e "s/\t*$//" > newbedfile

    ~/kent/src/hg/stsMarkers/cleanInfo.pl -mouse newbedfile \
        | sed -e "s/\t*$//" > mm9.stsInfoMouse.bed

    # copy the stsInfoMouse.bed file from working dir to the marker
    #	info storage folder.  added 2 new steps by Yontao
    #	be wary of the archive name here, check the directory and get
    #	the name right here.
    mv /cluster/store5/mouseMarker/stsInfoMouse.bed \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.secondTime
    cp -p mm9.stsInfoMouse.bed /cluster/store5/mouseMarker/stsInfoMouse.bed

    # comparing to previous, numbers increase slightly each time
    wc -l /cluster/store5/mouseMarker/stsInfoMouse.bed \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.secondTime \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm7 \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm6 \
	/cluster/store5/mouseMarker/stsInfoMouse.bed_mm5
    #	66782 /cluster/store5/mouseMarker/stsInfoMouse.bed
    #	60631 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm8.secondTime
    #	59843 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm7
    #	58980 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm6
    #	58493 /cluster/store5/mouseMarker/stsInfoMouse.bed_mm5

    # and from that, create new primer fa, epcr, etc:
    time ~/kent/src/hg/stsMarkers/luConvertPrimerToFa \
	mm9.stsInfoMouse.bed mouseP.fa mouseC.fa mouseP.info
    # the mouseC.fa file will be empty, should be more than last time
    wc -l mouse?.*

    #       0 mouseC.fa
    #  359647 mouseP.fa
    #   41247 mouseP.info

    #	the equivalent Mm8 files:
    #	     0 mouseC.fa
    #	308384 mouseP.fa
    #	 34666 mouseP.info

    #	copy the primers over to some filesystem close to the klusters
    #	and split them up to have a small number of sequences in one file

    mkdir /cluster/bluearc/mm9/stsMarkers
    cp -p mouseP.fa /cluster/bluearc/mm9/stsMarkers
    cd /cluster/bluearc/mm9/stsMarkers
    cp -p /cluster/data/mm9/11.ooc .
    mkdir split
    #	356 files for 41,247 sequences, == about 116 sequences per file
    faSplit sequence mouseP.fa 400 split/mm_

    # PLEASE NOTE /cluster/bin/i386/blat.2 SPECIFICALLY IS USED HERE. 
    #	This process could convert to a modern version of blat with the
    #	filters as described, for example, in the STS markers build in Hg18

    #  CLUSTER RUN FOR THE STS PRIMERS
    ssh kk
    cd /cluster/data/mm9/bed/STSmarkers
    mkdir primer
    mkdir ePCR
    cd primer
    mkdir out

    #	interestingly, this blat2.2 binary did not function correctly
    #	when given nib files.  It has only about 1/4th of the number of
    #	alignments as it gets when it used fa files for the target
    #	sequence.

    ls -1S /cluster/bluearc/mm9/stsMarkers/split > primers.list
    #	will fetch chrom sequences from the 2bit file
    cut -f1 /cluster/data/mm9/chrom.sizes > chr.list

    ## next time, make this script produce its results in /scratch/tmp
    ## then move result file to output instead of writing result
    ## to output
    #	usage: runBlat2 <primer split file> <chrom name> <result psl>
    #	Extracts the one chrom sequence from the 2bit file into a scratch
    #	directory, runs blat.2 with the split primer file as the query
    #	against it, then copies result.psl to the requested result path.
    #	The scratch directory is removed before it is created (as in the
    #	runBlat script used for the BAC ends) because under csh -e a
    #	failed command exits before the final cleanup, and a re-run of
    #	the job on that same node would otherwise die at the mkdir.
    cat << '_EOF_' > runBlat2
#!/bin/csh -fe
set primer = /cluster/bluearc/mm9/stsMarkers/split/$1
set root1 = $1:r
set fa = $root1.$2.fa
set ooc = /cluster/bluearc/mm9/stsMarkers/11.ooc
set root2 = $2:r
set tmpDir = /scratch/tmp/$root1.$root2
rm -fr $tmpDir
mkdir $tmpDir
mkdir -p out/${root2}
set out = $3
pushd $tmpDir
twoBitToFa -seq=$2 /iscratch/i/mus/mm9/mm9.2bit ${fa}
cp -p ${primer} primer.fa
cp -p ${ooc} 11.ooc

/cluster/bin/i386/blat.2 ${fa} primer.fa -ooc=11.ooc \
        -minMatch=1 -minScore=0 -minIdentity=80 -oneOff result.psl
popd
cp -p ${tmpDir}/result.psl ${out}
rm -fr ${tmpDir}
'_EOF_'
    #	<< happy emacs
    chmod +x runBlat2

    cat << '_EOF_' > template
#LOOP
./runBlat2 $(path1) $(path2) {check out line+ out/$(root2)/$(root1).psl}
#ENDLOOP
'_EOF_'
    #	<< happy emacs

    gensub2 primers.list chr.list template jobList
    para create jobList
    para try ... check ... push ... etc ...
# Completed: 12425 of 12425 jobs
# CPU time in finished jobs:    1438098s   23968.31m   399.47h   16.64d  0.046 y
# IO & Wait Time:                237582s    3959.69m    65.99h    2.75d  0.008 y
# Average job time:                 135s       2.25m     0.04h    0.00d
# Longest finished job:            2150s      35.83m     0.60h    0.02d
# Submission to last job:          4736s      78.93m     1.32h    0.05d

    # on the file server
    ssh kkstore06
    cd /cluster/data/mm9/bed/STSmarkers/primer
    time nice -n +19 pslSort dirs primers.raw.psl temp out/chr*
    #	real    1m34.193s
    #	-rw-rw-r--   1 700293557 Aug  6 10:22 primers.raw.psl

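    #	filter alignments for (qEnd-qStart) vs. (tEnd-tStart)
    #	should not be more than 100 bases different.
    #	This filters out about 1,157,473 alignments, or
    #	%17.8 = 100.0 * 1157473 / 6498150 (see the wc -l results below;
    #	the earlier figures of 948,260 / 5,462,936 = %17.4 do not match
    #	this run)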
    time nice -n +19 pslSort dirs stdout temp out/chr* | awk -F"\t" '
{ if (((($13 - $12) - ($17 - $16)) > -100) &&
	((($13 - $12) - ($17 - $16)) < 100)) {print}
}
' > primers.100.psl

    rmdir temp

    wc -l *.psl
    #	5340677 primers.100.psl
    #	6498150 primers.raw.psl

    echo "6498150-5340677" | bc -q
    #	1157473 difference

    # a rough comparison with previous results:

    wc -l primers.100.psl \
	/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.100
# 5340677 primers.100.psl
# 4514676 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.100

    # another kluster run for the ePCR
    ssh pk
    cd /cluster/data/mm9/bed/STSmarkers/ePCR
    cut -f1 /cluster/data/mm9/chrom.sizes > chr.list

    #	Using previously fetched e-PCR source from
    #	ftp://ftp.ncbi.nlm.nih.gov/pub/schuler/e-PCR/
    #	version 2.3.1 11 Feb 2005
    #	Had to add the following to both re-PCR_main.cpp and
    #	e-PCR_main.cpp to get them to compile on kolossus:
// max and min Copied from /usr/include/mysql/my_global.h
#define max(a, b)       ((a) >? (b))
#define min(a, b)       ((a) <? (b))

    mkdir out
    #	usage: runPCR <chrom name> <result file>
    #	Extracts the one chrom sequence from the 2bit file into a scratch
    #	directory, runs e-PCR with the mouseP.info primers against it,
    #	then copies the result to the requested result path.
    #	The scratch directory is removed before it is created because
    #	under csh -e a failed command exits before the cleanup, and a
    #	re-run of the job on that same node would otherwise die at the
    #	mkdir.  Cleanup is a single rm -fr so that an unexpected extra
    #	file in the scratch directory cannot fail the job after the
    #	result has already been copied out.
    cat << '_EOF_' > runPCR
#!/bin/csh -fe
set chr = $1
set out = $2
set wrkdir = /scratch/tmp/epcr.mm9.$chr
set fa = $chr.fa
set tmpResult = $chr.result.epcr
rm -fr $wrkdir
mkdir $wrkdir
twoBitToFa -seq=$chr /san/sanvol1/scratch/mm9/mm9.2bit $wrkdir/$fa
pushd $wrkdir
/cluster/bin/x86_64/e-PCR \
    /cluster/data/mm9/bed/STSmarkers/mouseP.info $fa N=1 M=50 W=5 > $tmpResult
popd
cp -p $wrkdir/$tmpResult $out
rm -fr $wrkdir
'_EOF_'
    # << happy emacs
    chmod +x runPCR

    cat << '_EOF_' > template
#LOOP
./runPCR $(path1) {check out line+ out/$(root1).epcr}
#ENDLOOP
'_EOF_'
    # << the mouseP.info was created above
    gensub2 chr.list single template jobList
    para create jobList
    para try
    para check
    para push
    ... etc ...
    ## two of those produce zero results:
    #	-rw-rw-r--  1      0 Aug  6 12:53 chr3_random.epcr
    #	-rw-rw-r--  1      0 Aug  6 12:53 chr16_random.epcr
    ## hence, the two crashed jobs in the check display:
# Completed: 33 of 35 jobs
# Crashed: 2 jobs
# CPU time in finished jobs:      80940s    1349.01m    22.48h    0.94d  0.003 y
# IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
# Average job time:                2327s      38.78m     0.65h    0.03d
# Longest finished job:            6980s     116.33m     1.94h    0.08d
# Submission to last job:         15589s     259.82m     4.33h    0.18d

    ssh kkstore06
    cd /cluster/data/mm9/bed/STSmarkers/ePCR
    # all those results become all.epcr
    cat out/*.epcr > all.epcr

    # comparing to previous results, should have more with new results:
    wc -l all.epcr /cluster/data/mm8/bed/STSmarkers.2006-08-29/ePCR/all.epcr
    #	87623 all.epcr
    #	58162 /cluster/data/mm8/bed/STSmarkers.2006-08-29/ePCR/all.epcr

    cd /cluster/data/mm9/bed/STSmarkers/primer

    ~/kent/src/hg/stsMarkers/filterSTSPrimers \
    -mouse ../mm9.stsInfoMouse.bed primers.100.psl \
        ../mouseP.info ../ePCR/all.epcr > primers.psl.filter.blat

    #  The output should show an increasing count:
    #	Reading name info from: ../mm9.stsInfoMouse.bed
    #	Reading primer info from: ../mouseP.info
    #	Reading ePCR info from: ../ePCR/all.epcr
    #	Reading alignment results from: primers.100.psl
    #	100000
    #	200000
    #	...
    #	5200000
    #	5300000
    #	Determining ePCR not found from ePCR results
    #	Out of 26332 ePCR alignments examined, not found: 527

    ## compare with previous build results
    wc -l primers.psl.filter.blat \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.blat
# 35537 primers.psl.filter.blat
# 34043 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.blat

    ## ouch, mm9 does not have lift files for contigs to chroms
    ## let's make a contig lift file
    cd /cluster/data/mm9/jkStuff
    cp -p /cluster/data/cb3/jkStuff/agpToLift.pl .
    grep CONTIG ../mouse_37/mm9.contigs.agp \
	| ./agpToLift.pl /dev/stdin > mm9.contigs.lift
    awk '{if (! match($5,"N")) print}' ../mouse_37/mm9.fragments.agp \
	| /cluster/data/rn3/jkStuff/agpToLift.pl ../chrom.sizes /dev/stdin \
	> mm9.fragments.lift
    cd ..
    mkdir ctgLifts
    splitFileByColumn -col=4 jkStuff/mm9.contigs.lift ctgLifts
    mkdir fragmentLifts
    splitFileByColumn -col=4 jkStuff/mm9.fragments.lift fragmentLifts
    
    ## distribute those in the old-style lift directory hierarchy:
    ## each <chrom>/lift directory is rebuilt from scratch, then receives
    ## ordered.lft and random.lft when the matching ctgLifts file exists
    chromNames="1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 Un X Y M"
    for chrom in ${chromNames}
    do
        liftDir="${chrom}/lift"
        orderedSrc="ctgLifts/chr${chrom}.contigs.lift"
        randomSrc="ctgLifts/chr${chrom}_random.contigs.lift"
        # start clean so no lift file from an earlier pass survives
        rm -fr "${liftDir}"
        mkdir -p "${liftDir}"
        if [[ -f "${orderedSrc}" ]]; then
            cp -p "${orderedSrc}" "${liftDir}/ordered.lft"
        fi
        if [[ -f "${randomSrc}" ]]; then
            cp -p "${randomSrc}" "${liftDir}/random.lft"
        fi
    done
    ## not the fragments
#    for C in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 Un X Y M
#do
#    mkdir -p ${C}/lift
#    if [ -f fragmentLifts/chr${C}.fragments.lift ]; then
#	cp -p fragmentLifts/chr${C}.fragments.lift ${C}/lift/ordered.lft
#    fi
#    if [ -f fragmentLifts/chr${C}_random.fragments.lift ]; then
#	cp -p fragmentLifts/chr${C}_random.fragments.lift ${C}/lift/random.lft
#    fi
#done

    ## now, after that side trip, back to the primer business
    #	The contig lift side trip left the working directory at
    #	/cluster/data/mm9, but the steps from here on read and write
    #	their files relative to the primer directory (../mouseP.info,
    #	primers.psl.filter.blat, accession_info.rdb), so return there.
    cd /cluster/data/mm9/bed/STSmarkers/primer
    # create file accession_info.rdb
    #	an empty file is passed as the second argument to compileAccInfo
    touch empty_sequence.inf
    ~/kent/src/hg/stsMarkers/compileAccInfo -mouse \
	/cluster/data/mm9 empty_sequence.inf
    #	20363 processed
    #	sort the result through a temporary copy, then discard the copy
    mv accession_info.rdb accession_info.rdb.tmp
    ~/kent/src/hg/stsMarkers/sorttbl -x Chr Ord Start \
	< accession_info.rdb.tmp > accession_info.rdb
    #	The -x prints the debug statement:
    #	sort arg:  -t"  " +0 -1 +1 -2g +2 -3g
    rm accession_info.rdb.tmp

    # comparing results to previous
    #	Continuing the trend that began with Mm7, the numbers in
    #	accession_info.rdb continue to decrease.  Even Mm8 has far fewer
    #	fragments than did mm7:
    #	e.g.:
    [hiram@kkstore06 /cluster/data] wc -l mm9/?/chr*.agp mm9/??/chr*.agp | tail -1
    #	21699 total
    [hiram@kkstore06 /cluster/data] wc -l mm8/*/chr*.agp | tail -1
    #	21910 total
    [hiram@kkstore06 /cluster/data] wc -l mm7/*/chr*.agp | tail -1
    #	70125 total
    [hiram@kkstore06 /cluster/data] wc -l mm6/*/chr*.agp | tail -1
    #	170812 total

    #	line counts of accession_info.rdb, this build vs. the mm8 build
    wc -l accession_info.rdb \
	/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/accession_info.rdb
    #	20333 accession_info.rdb
    #	20385 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/accession_info.rdb

    # creates epcr.not.found.nomatch and epcr.not.found.psl
    ~/kent/src/hg/stsMarkers/epcrToPsl -mouse \
	epcr.not.found ../mouseP.info \
	accession_info.rdb /cluster/data/mm9/mm9.2bit 2> dbg.epcrToPsl
    #	the dbg.epcrToPsl has a number of lines complaining about bad
    #	primers in ../mouseP.info - and indeed they are bad primers,
    #	they do not have a second primer.

    # Comparing results to previous:
    wc -l epcr* \
	/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/epcr*
# 527 epcr.not.found
# 0 epcr.not.found.nomatch
# 527 epcr.not.found.psl
# 520 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/epcr.not.found
# 0 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/epcr.not.found.nomatch
# 520 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/epcr.not.found.psl

    # Mm7 wc epcr*
    wc -l /cluster/data/mm7/bed/STSmarkers/primer/epcr*
    #	 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found
    #	   0 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.nomatch
    #	 474 /cluster/data/mm7/bed/STSmarkers/primer/epcr.not.found.psl
    #	 158 /cluster/data/mm7/bed/STSmarkers/primer/epcrToPsl
    #	1106 total

    cat primers.psl.filter.blat epcr.not.found.psl > primers.psl.filter
    wc -l primers.psl.filter \
	/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter
    #	36064 primers.psl.filter
    #	34563 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter
    # create primers.psl.filter.lifted.initial
    #	The PATH setting allows extractPslInfo to find other programs that it
    #	is going to use.
    PATH=~/kent/src/hg/stsMarkers:$PATH \
	~/kent/src/hg/stsMarkers/extractPslInfo primers.psl.filter

    wc -l *.initial \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.initial
# 36040 primers.psl.filter.initial
# 34545 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.initial

    # create primers.psl.filter.lifted.initial.acc
    PATH=~/kent/src/hg/stsMarkers:$PATH \
    ~/kent/src/hg/stsMarkers/findAccession -agp \
	-mouse primers.psl.filter.initial /cluster/data/mm9
    wc -l *.initial.acc /cluster/data/mm8/bed\
/STSmarkers.2006-08-29/primer/primers.psl.filter.initial.acc
# 36040 primers.psl.filter.initial.acc
# 34545 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.psl.filter.initial.acc


    # this needs to be -rat as that specifies how to scan the
    # stsInfoMouse.bed file and it does not work if you use -mouse
    # it is not clear what -mouse would mean to this script, some other file
    # format perhaps from the stsInfoMouse.bed format.
    ~/kent/src/hg/stsMarkers/getStsId -rat \
	../mm9.stsInfoMouse.bed  primers.psl.filter.initial.acc \
	| sort -k4,4n > primers.final
    wc -l primers.final \
	/cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.final
# 36040 primers.final
# 34545 /cluster/data/mm8/bed/STSmarkers.2006-08-29/primer/primers.final

    cd /cluster/data/mm9/bed/STSmarkers
    # stsMarkers.final is empty for mouse
    touch stsMarkers.final dummy
    PATH=~/kent/src/hg/stsMarkers:$PATH \
    ~/kent/src/hg/stsMarkers/combineSeqPrimerPos \
	stsMarkers.final primer/primers.final > stsMarkers_pos.rdb
    wc -l stsMarkers_pos.rdb \
	/cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMarkers_pos.rdb
# 34232 stsMarkers_pos.rdb
# 33048 /cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMarkers_pos.rdb

    PATH=~/kent/src/hg/stsMarkers:$PATH \
    ~/kent/src/hg/stsMarkers/createStsBed \
	mm9.stsInfoMouse.bed  stsMarkers_pos.rdb 500 \
	| sort -k1,1 -k2,2n | sed -e "s/ //g" > stsMapMouse.bed
    #	The sed removes unneeded blanks
    #	verify score profile remains similar
    awk -F'\t' '{print $5}' stsMapMouse.bed | sort -n | uniq -c
    #	   591 500
    #	  1774 750
    #	 28529 1000
    awk -F'\t' '{print $5}' \
	/cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMapMouse.bed \
	| sort -n | uniq -c
    #	  546 500
    #	 1650 750
    #	27705 1000

    wc -l stsMapMouse.bed \
	/cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMapMouse.bed
# 30894 stsMapMouse.bed
# 29901 /cluster/data/mm8/bed/STSmarkers.2006-08-29/stsMapMouse.bed
    ## check the names, look for odd ones
    ##  the bogus names "-" were fixed for mm9
    awk -F'\t' '{print $4}' stsMapMouse.bed | sort | head
    awk -F'\t' '{print $4}' stsMapMouse.bed | sort | tail

    #  loading STS markers tables
    ssh hgwdev
    cd /cluster/data/mm9/bed/STSmarkers
    ~/kent/src/hg/stsMarkers/ucscAlias.pl \
	mm9.stsInfoMouse.bed > ucscStsAlias.tab 2> ucscStsAlias.warnings
    #	this does leave messages in ucscStsAlias.warnings but they seem
    #	to be very similar to Mm6 with just a few new ones
     
    wc -l ucscStsAlias.tab \
	/cluster/data/mm8/bed/STSmarkers.2006-08-29/ucscStsAlias.tab
# 146359 ucscStsAlias.tab
# 146767 /cluster/data/mm8/bed/STSmarkers.2006-08-29/ucscStsAlias.tab


    ssh hgwdev
    cd /cluster/data/mm9/bed/STSmarkers
    ## when reloading:
    hgsql -e "drop table stsAlias;" mm9
    hgsql -e "drop table stsMapMouseNew;" mm9
    hgsql -e "drop table stsInfoMouseNew;" mm9

    hgsql mm9 < ~/kent/src/hg/lib/stsAlias.sql
    hgsql -e \
	'load data local infile "ucscStsAlias.tab" into table stsAlias;' mm9
    hgsql mm9 < ~/kent/src/hg/lib/stsMapMouseNew.sql
    hgsql -e \
'load data local infile "stsMapMouse.bed" into table stsMapMouseNew;' mm9
    hgsql mm9 < ~/kent/src/hg/lib/stsInfoMouseNew.sql
    hgsql -e \
     'load data local infile "mm9.stsInfoMouse.bed" into table stsInfoMouseNew;' mm9

    hgsql -e "drop table all_sts_primer;" mm9
    hgLoadPsl -nobin -table=all_sts_primer mm9 primer/primers.psl.filter
# load of all_sts_primer did not go as planned: 36064 record(s),
#	0 row(s) skipped, 1 warning(s) loading primer/primers.psl.filter
    #	After warnings, checkTableCoords to find problems:
    checkTableCoords -verboseBlocks mm9 all_sts_primer
mm9.all_sts_primer item 61999 chr10:62485403-62485439: blocks 0 and 1 overlap.
mm9.all_sts_primer has 1 records with overlapping blocks.
    #	Strip the offending item from the load
    hgsql -e 'delete from all_sts_primer where tName="chr10" AND tStart=62485403 AND tEnd=62485439;' mm9

    # load primer sequences	
    mkdir /gbdb/mm9/stsMarker
    ln -s /cluster/data/mm9/bed/STSmarkers/mouseP.fa \
	/gbdb/mm9/stsMarker/mouseP.fa
    # PLEASE NOTE: if you are going to reload this business, use the
    #	-replace option on this hgLoadSeq
    #	hgLoadSeq -replace mm9 /gbdb/mm9/stsMarker/mouseP.fa
    # otherwise there will be a problem that the seq and extFile tables
    # will be out of sync.
    hgLoadSeq -replace  mm9 /gbdb/mm9/stsMarker/mouseP.fa
    #  Adding /gbdb/mm9/stsMarker/mouseP.fa
    #	41247 sequences
    #	Warning: load of seq did not go as planned: 41330 record(s), 0 row(s)
    #	skipped, 1 warning(s) loading ./seq.tab

    ## joinerCheck should be clean:
    joinerCheck -keys -identifier=mouseStsTrueName -database=mm9 all.joiner
# Checking keys on database mm9
#  mm9.stsAlias.trueName - hits 146350 of 146359 ok
#  mm9.all_sts_primer.qName - hits 35537 of 36063 ok
#  mm9.stsMapMouseNew.name - hits 30894 of 30894 ok

    featureBits mm9 all_sts_primer
    #	3795229 bases of 2620346127 (0.145%) in intersection
    featureBits mm8 all_sts_primer
    #	3700897 bases of 2567283971 (0.144%) in intersection
    featureBits mm9 stsMapMouseNew
    #	4884563 bases of 2620346127 (0.186%) in intersection
    featureBits mm8 stsMapMouseNew
    #	4812616 bases of 2567283971 (0.187%) in intersection

    hgsql -N mm9 -e "select count(*) from stsAlias;"
    #	146359
    hgsql -N mm8 -e "select count(*) from stsAlias;"
    #	146767
    hgsql -N mm9 -e "select count(*) from stsInfoMouseNew;"
    #	66782
    hgsql -N mm8 -e "select count(*) from stsInfoMouseNew;"
    #	60631

    #	compare old and new name lists, not much difference:
    awk '{print $4}' stsMapMouse.bed | sort -u > mm9.nameList
    #	in common with previous version:
    comm -12 \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/mm8.nameList \
	mm9.nameList | wc -l
    #	28596
    #	unique to previous version:
    comm -23 \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/mm8.nameList \
	mm9.nameList | wc -l
    #	111
    #	unique to this new set:
    comm -13 \
/cluster/data/mm8/bed/STSmarkers.2006-08-29/mm8.nameList \
	mm9.nameList | wc -l
    #	1017

###########################################################################
#	Reset default position to be same area as Mm8, 2007-08-02 - Hiram
    hgsql -e \
'update dbDb set defaultPos="chr12:57795963-57815592" where name="mm9";' \
	hgcentraltest

##############################################################################
# CLONE ENDS - BACEND TRACK (DONE - 2007-08-02 - 2007-08-03 - Hiram)
    ssh kkstore06
    cd /cluster/data/mm9
    # check disk space: 1.2T free
    df -h .
# Filesystem            Size  Used Avail Use% Mounted on
# /export/cluster/store4
#			2.3T  997G  1.2T  46% /cluster/store4
    mkdir -p bed/cloneend/ncbi
    cd bed/cloneend/ncbi

    wget --timestamping \
	ftp://ftp.ncbi.nih.gov/genomes/CLONEEND/mus_musculus/*

    cd /cluster/data/mm9/bed/cloneend
    # seems like the *.mfa files were split just for convenience
    # concatenate, and convert the title line of the fasta sequences
    cat << '_EOF_' > convert.pl
#!/usr/bin/env perl

use strict;
use warnings;

while (my $line = <>) {
    if ($line !~ m/^>/) {
	print $line
    } else {
        my @fields = split('\|', $line);
	my $fieldCount = scalar(@fields);
        my $printed = 0;
        for (my $i = 0; $i < $fieldCount; $i++) {
                if ($fields[$i] eq "gb" || $fields[$i] eq "dbj") {
                        (my $name, my $vers) = split(/\./,$fields[$i+1]);
                        print ">$name\n";
                        $i= $fieldCount;
                        $printed = 1;
                }
        }
	die("Failed for $line\n") if (!$printed);
    }
}
'_EOF_'
    # << happy emacs
    chmod +x convert.pl
    for F in ncbi/*.mfa.gz
    do
	zcat ${F}
    done | ./convert.pl | gzip > cloneEnds.fa.gz

    #	make sure nothing got broken:
    faSize ncbi/*.mfa.gz
# 498162791 bases (16779168 N's 481383623 real 304962409 upper
#	176421214 lower) in 789466 sequences in 44 files

    faSize cloneEnds.fa.gz
# 498162791 bases (16779168 N's 481383623 real 304962409 upper
#	176421214 lower) in 789466 sequences in 1 files
    #	identical numbers, curiously, these are exactly the same numbers
    #	as were seen during the build of Mm7.  Do these things not
    #	change with time ?

    # concatenate the text files, too
    for F in ncbi/*.txt.gz
    do
	zcat ${F}
    done | gzip > all.txt.gz

    # generate cloneEndPairs.txt and cloneEndSingles.txt
    zcat all.txt.gz | ~/kent/src/hg/utils/cloneEndParse.pl /dev/stdin
    #	Reading in end info
    #	Writing out pair info
    #	Writing out singleton info
    #	354485 pairs and 78423 singles


    #	faSplit does not function correctly if given a .gz source file
    #	AND, we need the unzipped file for sequence loading below
    gunzip cloneEnds.fa.gz
    # split
    mkdir split
    cd split
    ## adjust split size based on previous kluster performance, see below
    faSplit sequence ../cloneEnds.fa 500 cloneEnds
    #	Check to ensure no breakage:
    faSize c*.fa
# 498162791 bases (16779168 N's 481383623 real 304962409 upper 176421214
# lower) in 789466 sequences in 98 files
# %35.41 masked total, %36.65 masked real
    #	same numbers as before

    #	Copy to san for cluster runs
    mkdir /san/sanvol1/scratch/mm9/cloneEnds
    rsync -a --progress --stats ./ /san/sanvol1/scratch/mm9/cloneEnds/
    rm *
    cd ..
    rmdir split
    #	may as well remove the previous assembly copy:
    rm -fr /san/sanvol1/scratch/mm8/cloneEnds

    # load sequences
    ssh hgwdev
    mkdir /gbdb/mm9/cloneend
    cd /gbdb/mm9/cloneend
    ln -s /cluster/data/mm9/bed/cloneend/cloneEnds.fa .
    cd /tmp
    hgLoadSeq mm9 /gbdb/mm9/cloneend/cloneEnds.fa
    #  Advisory lock created
    # Creating .tab file
    # Adding /gbdb/mm9/cloneend/cloneEnds.fa
    # 789466 sequences
    # Updating seq table
    # Advisory lock has been released
    # All done
    ## clean up garbage
    rm seq.tab

############################################################################
# BACEND SEQUENCE ALIGNMENTS (DONE - 2007-08-06 - Hiram)
    ssh kkstore06
    mkdir /cluster/data/mm9/noMask
    cd /cluster/data/mm9/
    #	Need an unmasked sequence for this work
    for C in `cut -f1 chrom.sizes`
do
    echo twoBitToFa -noMask -seq=${C} mm9.2bit noMask/${C}.fa
    twoBitToFa -noMask -seq=${C} mm9.2bit noMask/${C}.fa
done
    # verify nothing broken
    faSize noMask/c*.fa
# 2725765481 bases (105419509 N's 2620345972 real 2620345972 upper 0 lower) in
# 35 sequences in 35 files
    # note, this was the same as long ago when the mm9.2bit was measured:
# 2725765481 bases (105419509 N's 2620345972 real 1466644650 upper
#	1153701322 lower) in 35 sequences in 1 files

    # copy to san for kluster run
    mkdir /san/sanvol1/scratch/mm9/noMask
    rsync -a --progress --stats noMask/ /san/sanvol1/scratch/mm9/noMask/

    # 11.ooc file is already there from the genbank build
    
    #	and now for the kluster run
    ssh pk
    mkdir /cluster/data/mm9/bed/bacends
    cd /cluster/data/mm9/bed/bacends
    mkdir out

    # allow blat to run politely in /tmp while it writes output, then
    # copy results to results file:
    cat << '_EOF_' > runBlat
#!/bin/csh -fe 
set root1 = $1
set root2 = $2
set result = $3
rm -fr /scratch/tmp/${root1}_${root2}
mkdir /scratch/tmp/${root1}_${root2}
cp -p /san/sanvol1/scratch/mm9/11.ooc /scratch/tmp/${root1}_${root2}
cp -p /san/sanvol1/scratch/mm9/noMask/${root1}.fa \
	/scratch/tmp/${root1}_${root2}
cp -p /san/sanvol1/scratch/mm9/cloneEnds/${root2}.fa \
	/scratch/tmp/${root1}_${root2}
pushd /scratch/tmp/${root1}_${root2}
/cluster/bin/x86_64/blat ${root1}.fa ${root2}.fa \
	-ooc=11.ooc ${root1}.${root2}.psl
popd
mkdir -p out/${root2}
rm -f ${result}
cp -p /scratch/tmp/${root1}_${root2}/${root1}.${root2}.psl ${result}
rm -fr /scratch/tmp/${root1}_${root2}
'_EOF_'
    #	<< happy emacs
    chmod +x runBlat

    cat << '_EOF_' > template
#LOOP
./runBlat $(root1) $(root2) {check out line+ out/$(root2)/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
    # << emacs happy

    ls -1S /san/sanvol1/scratch/mm9/cloneEnds/cloneEnds*.fa > bacEnds.lst
    ls -1S /san/sanvol1/scratch/mm9/noMask/chr*.fa > chrom.lst
    gensub2 chrom.lst bacEnds.lst template jobList
    para create jobList
    # 17150 jobs written to batch
    para try, check, push, etc ...
# Completed: 17150 of 17150 jobs
# CPU time in finished jobs:     698826s   11647.09m   194.12h    8.09d  0.022 y
# IO & Wait Time:                262556s    4375.94m    72.93h    3.04d  0.008 y
# Average job time:                  56s       0.93m     0.02h    0.00d
# Longest finished job:             332s       5.53m     0.09h    0.00d
# Submission to last job:        250536s    4175.60m    69.59h    2.90d

    ssh kkstore06
    cd /cluster/data/mm9/bed/bacends
    screen

    mkdir temp
    time nice -n +19 pslSort dirs raw.psl temp out/* > pslSort.out 2>&1 &
    #	real    22m4.019s
    #	-rw-rw-r--    1 8423154460 Aug  6 13:40 raw.psl

    time nice -n +19 pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 \
	-noIntrons raw.psl bacEnds.psl /dev/null > pslReps.out 2>&1 &
    #	real    6m1.174s
    #	-rw-rw-r--    1 1236810588 Aug  6 13:51 bacEnds.psl

    #	split this large psl file into pieces with 100,000 lines each
    #	to prepare for a sort
    time nice -n +19 ~/kent/src/hg/pslSplitOnTarget/pslSplitLineCount.pl \
	100000 bacEnds.psl split/bacends
    #	real    0m15.389s

    #	save original file, then sort
    mv bacEnds.psl bacEnds.psl.save
    time pslSort dirs bacEnds.psl temp split
    #	real    2m19.131s
    #	-rw-rw-r--    1 1236810588 Aug  6 14:38 bacEnds.psl

    ## compare to previous results
    wc -l bacEnds.psl /cluster/data/mm8/bed/bacends/bacEnds.psl
    #	10294737 bacEnds.psl
    #	10229750 /cluster/data/mm8/bed/bacends/bacEnds.psl

    ## work at top-level directory after this
    mkdir /cluster/data/mm9/bacends
    cp -p bacEnds.psl /cluster/data/mm9/bacends

############################################################################
# BACEND PAIRS TRACK (DONE - 2007-08-06 - Hiram)

    ssh kolossus
    cd /cluster/data/mm9/bacends

    time nice -n +19 pslPairs -tInsert=10000 -minId=0.91 -noBin -min=25000 \
	-max=350000 -slopval=10000 -hardMax=500000 -slop -short -long -orphan \
	-mismatch -verbose bacEnds.psl \
	../bed/cloneend/cloneEndPairs.txt all_bacends bacEnds
    #	real    0m49.120s
    ## produces files:
    #	-rw-rw-r--  1     199185 Aug  6 14:46 bacEnds.slop
    #	-rw-rw-r--  1     144486 Aug  6 14:46 bacEnds.short
    #	-rw-rw-r--  1   24399410 Aug  6 14:46 bacEnds.pairs
    #	-rw-rw-r--  1   25421100 Aug  6 14:46 bacEnds.orphan
    #	-rw-rw-r--  1     201794 Aug  6 14:46 bacEnds.mismatch
    #	-rw-rw-r--  1      15928 Aug  6 14:46 bacEnds.long

    # create header required by "rdb" tools
    echo -e \
"chr\tstart\tend\tclone\tscore\tstrand\tall\tfeatures\tstarts\tsizes" > header
    echo -e "10\t10N\t10N\t10\t10N\t10\t10\t10N\t10\t10" >> header

    cat header bacEnds.pairs | \
	/cluster/bin/scripts/row score ge 300 | \
	/cluster/bin/scripts/sorttbl chr start | \
	/cluster/bin/scripts/headchg -del > bacEndPairs.bed
    #	-rw-rw-r--  1   24201067 Aug  6 14:49 bacEndPairs.bed

    cat header bacEnds.slop bacEnds.short bacEnds.long bacEnds.mismatch \
	bacEnds.orphan | /cluster/bin/scripts/row score ge 300 | \
	/cluster/bin/scripts/sorttbl chr start | \
	/cluster/bin/scripts/headchg -del > bacEndPairsBad.bed
    #	-rw-rw-r--  1    6888559 Aug  6 14:49 bacEndPairsBad.bed

    /cluster/bin/scripts/extractPslLoad -noBin bacEnds.psl bacEndPairs.bed \
	bacEndPairsBad.bed >j1.out
    #	-rw-rw-r--  1  989173324 Aug  6 14:52 j1.out
    cat j1.out | /cluster/bin/scripts/sorttbl tname tstart >j2.out
    #	-rw-rw-r--  1  989173324 Aug  6 15:07 j2.out
    cat j2.out | /cluster/bin/scripts/headchg -del > bacEnds.load.psl
    #	-rw-rw-r--  1  989173165 Aug  6 15:08 bacEnds.load.psl

    rm j1.out j2.out

    #	CHECK bacEndPairs.bed ID's to make sure they have no blanks in them
    awk '{print $5}' bacEndPairs.bed | sort -u
    #	result should be the scores, no extraneous strings:
#	1000
#	300
#	375
#	500
#	750
    #	edit the file and fix it if it has a bad name.
    wc -l bacEnds.load.psl /cluster/data/mm8/bacends/bacEnds.load.psl
    #	8167555 bacEnds.load.psl
    #	8132116 /cluster/data/mm8/bacends/bacEnds.load.psl

    # load into database
    ssh hgwdev
    cd /cluster/data/mm9/bacends
    hgLoadBed -notItemRgb mm9 bacEndPairs bacEndPairs.bed \
	-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
    #	Loaded 239101 elements of size 11

    # note - this track isn't pushed to RR, just used for assembly QA
    hgLoadBed -notItemRgb mm9 bacEndPairsBad bacEndPairsBad.bed \
	-sqlTable=$HOME/kent/src/hg/lib/bacEndPairsBad.sql
    #	Loaded 84679 elements of size 11

    # NOTE: truncates file to 0 if -nobin is used
    time hgLoadPsl mm9 -table=all_bacends bacEnds.load.psl
# load of all_bacends did not go as planned: 8167555 record(s), 0 row(s)
# skipped, 2 warning(s) loading psl.tab
#	real    4m1.142s
    ## to find out what the warnings are about:
    ## first, on hgwdev, dump the loaded table
    hgsql -N -e "select qName from all_bacends;" mm9 \
	| sort -u > all_bacends.qName.txt
    ## then on kkstore06 compare the resulting load with the requested load file
    diff psl.tab mm9.all_bacends.txt
    ## this diff shows two markers had their qBaseInsert count changed from
    ##	a negative number to a zero since that field is an unsigned
    ## AG326808 and AG609381

    ## joinerCheck should be clean:
    joinerCheck -keys -identifier=bacEndNames -database=mm9 all.joiner
# Checking keys on database mm9
#  mm9.bacEndPairs.lfNames - hits 478202 of 478202 ok

    featureBits mm9 all_bacends
# 349085662 bases of 2620346127 (13.322%) in intersection
    featureBits mm8 all_bacends
# 327086559 bases of 2567283971 (12.741%) in intersection
    featureBits mm7 all_bacends
# 334161740 bases of 2583394090 (12.935%) in intersection
    featureBits mm6 all_bacends
# 336981828 bases of 2597150411 (12.975%) in intersection
    featureBits mm5 all_bacends
# 268502414 bases of 2615483787 (10.266%) in intersection
    featureBits mm4 all_bacends
# 243096171 bases of 2627444668 (9.252%) in intersection

    featureBits mm9 bacEndPairs
# 209909804 bases of 2620346127 (8.011%) in intersection
    featureBits mm8 bacEndPairs
# 2572527283 bases of 2567283971 (100.204%) in intersection
    featureBits mm7 bacEndPairs
# 2578837424 bases of 2583394090 (99.824%) in intersection
    featureBits mm6 bacEndPairs
# 2570768812 bases of 2597150411 (98.984%) in intersection
    featureBits mm5 bacEndPairs
# 2567958504 bases of 2615483787 (98.183%) in intersection
    featureBits mm4 bacEndPairs
# 2549945356 bases of 2627444668 (97.050%) in intersection

    featureBits mm9 bacEndPairsBad
# 48850302 bases of 2620346127 (1.864%) in intersection

#######################################################################
#  Special one-off bacEnds added (DONE - 2008-01-09 - Hiram)
    ssh hgwdev
    # BAC RP23-473N24 was reported missing
    #	its two ends are AZ095043 and AZ095046
    #	end AZ095046 maps just fine to the correct location on chr7
    #	the end AZ095043 does not map correctly when using the -ooc
    #	option to blat.  Run the blat without ooc and it does the
    #	correct thing.  From the genbank record:
    cd /cluster/data/mm9/bed/bacends
    cat << '_EOF_' > AZ095043.fa
>AZ095043
TTTATCATGAATGGGTGTTGTATCTTGTCGAAGCTTTTTCCGCATCTAACGAGATGATCATGTGGTTTTT
GTCTTTGAGTTTGTTTATATAATGGATTACATTGATGGATTTTCATATATTAAACCATCCCTGCATCCCT
GGAATAAAACCTACTTGGTCAGGATGGATGACTGCCAAGGCGGACCGGG
'_EOF_'
    blat /san/sanvol1/scratch/mm9/noMask/chr7.fa AZ095043.fa AZ095043.raw.psl
    pslReps -nearTop=0.01 -minCover=0.7 -minAli=0.8 \
	-noIntrons AZ095043.raw.psl AZ095043.psl /dev/null
    #	before adding this one item:
    hgsql -e "select count(*) from all_bacends;" mm9
    #	8167555
    hgLoadPsl -table=all_bacends -append mm9 AZ095043.psl
    #	verify one row added
    hgsql -e "select count(*) from all_bacends;" mm9
    #	8167556
    #	Using the Mm6 records from all_bacends and bacEndPairs as a guide
    #	The bed record for this BAC is therefore:
    cat << '_EOF_' > RP23-473N24.bed
chr7 150015932 150193247 RP23-473N24 1000 - all_bacends 2 150015932,150192880 172,367 AZ095043,AZ095046
'_EOF_'
    #	verify rows before adding this one new row
    hgsql -e "select count(*) from bacEndPairs;" mm9
    #	239101
    #	YOW !  The -oldTable option didn't work !  I'm guessing that with
    #	the -sqlTable argument it became confused
    hgLoadBed -oldTable -notItemRgb mm9 bacEndPairs RP23-473N24.bed \
	-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql
    #	reload everything:
    cat ../../bacends/bacEndPairs.bed RP23-473N24.bed \
	| hgLoadBed -notItemRgb mm9 bacEndPairs stdin \
	-sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql

#######################################################################
## create random contigs for genscan and other alignment tasks
## DONE - 2007-08-07 - Hiram
    ssh kkstore06
    #	everything below uses paths relative to the assembly directory:
    #	N/lift/random.lft, mm9.2bit and randomContigs/
    cd /cluster/data/mm9
    mkdir randomContigs
    #	one contig fasta per chrN_random; N is the leading directory
    #	name of the lift file path
    for L in ?/lift/random.lft ??/lift/random.lft
do
    D=${L/\/lift*}
    echo $L $D
    ~/kent/src/hg/utils/lft2BitToFa.pl mm9.2bit ${L} \
	> randomContigs/chr${D}_random.ctg.fa
done
    #
    #	Verify these *.ctg.fa files have the same bases as the ordinary
    #	chr*_random.fa files:
    ## don't have these fasta files yet, extract them from the 2bit
    grep random chrom.sizes | cut -f1 | sed -e "s/^chr//; s/_random//" \
	| while read C
do
    echo "twoBitToFa -seq=chr${C}_random mm9.2bit ${C}/chr${C}_random.fa"
    twoBitToFa -seq=chr${C}_random mm9.2bit ${C}/chr${C}_random.fa
done
    ## now we can measure them
    faSize ?/chr?_random.fa ??/chr??_random.fa
    #	70853964 bases (9033771 N's 61820193 real 26427973 upper
    #	35392220 lower) in 13 sequences in 13 files

    ## and our contig versions
    faSize randomContigs/*.ctg.fa
    #	62053964 bases (233771 N's 61820193 real 26427973 upper
    #	35392220 lower) in 189 sequences in 13 files
    ## note, same number of real, upper and lower, only different N's

    ## it would be nice to have the actual chroms too
    grep -v random chrom.sizes | cut -f1 | sed -e "s/^chr//" \
	| while read C
do
    echo "twoBitToFa -seq=chr${C} mm9.2bit ${C}/chr${C}.fa"
    twoBitToFa -seq=chr${C} mm9.2bit ${C}/chr${C}.fa
done
    # measure that result
    faSize ?/chr?.fa ??/chr??.fa
    #	2654911517 bases (96385738 N's 2558525779 real 1438609919
    #	upper 1119915860 lower) in 22 sequences in 22 files
    ## is this the amount of sequence specified in chrom.sizes ?
    grep -v random chrom.sizes | ave -col=2 stdin | grep total
    #	total 2654911517.000000
    ## same number, nothing lost

#########################################################################
# GENSCAN PREDICTIONS (DONE - 2007-08-07 - 2007-08-10 - Hiram)
    ssh kkstore06
    #	Create a 2bit file with the full chrom sequences and the
    #	random contigs, all hard masked
    ## later it was found that chr16_random.ctg.fa should not be in
    ##	this genscan run.  So, it was temporarily taken out of this directory
    ## and this sequence was rerun to avoid it.
    cat ?/chr?.fa ??/chr??.fa randomContigs/chr*.ctg.fa \
	| maskOutFa stdin hard stdout \
	    | faToTwoBit stdin mm9Chroms_RandomContigs.hard.2bit
    #  with chr16_random removed:
    #	2716961487 bases (1251923595 N's 1465037892 real 1465037892 upper 0
    #	lower) in 210 sequences in 1 files

    #	make sure it still has all the unmasked sequence in it: (incl 16)
    twoBitToFa mm9Chroms_RandomContigs.hard.2bit stdout \
	| faSize stdin
    # 2716965481 bases (1251927589 N's 1465037892 real 1465037892 upper
    #	0 lower) in 211 sequences in 1 files
    twoBitToFa mm9.2bit stdout | faSize stdin
    # 2725765481 bases (105419509 N's 2620345972 real 1465037892 upper
    # 1155308080 lower) in 35 sequences in 1 files
    #	note the upper bases are the same, the lowers have become N's
    #	lower 1155308080 + upper 1465037892 = 2620345972 real
    #	N's 1251927589 - N's 105419509  = 1146508080 ==
    #		N's in gaps between contigs

    #	And, make sure there aren't any sequences in this lot that have
    #	become all N's with no sequence left in them.  This drives genscan nuts
    twoBitToFa mm9Chroms_RandomContigs.hard.2bit stdout \
	| faCount stdin > chroms_randoms.faCount
    #	the lowest four are (second column is faCount column 2 minus
    #	column 7 - presumably sequence length minus N count, i.e. the
    #	bases left unmasked in each sequence):
    egrep -v "^#|^total" chroms_randoms.faCount \
	| awk '{print $1,$2-$7}' | sort -k2,2nr | tail -4
    #	NT_166474.1 75
    #	NT_166461.1 66
    #	NT_166481.1 39
    #	NT_166325.1 0
    ## This last one is the entire chr16_random and it is only 3,994 bases
    ## long and is marked entirely by RepeatMasker as a line.  It needs
    ## to be skipped during the run of genscan.  Go back to the 2bit creation
    ## and do not include chr16_random

    #	creating 4,000,000 sized chunks, the chroms stay together as
    #	single pieces.  The contigs get grouped together into 4,000,000
    #	sized fasta files.  You don't want to break these things up
    #	because genscan will be doing its own internal 2.4 million
    #	window on these pieces, and the gene names are going to be
    #	constructed from the sequence name in these fasta files.  The
    #	gene names are much better when they are this simple chrN.M
    #	numbering scheme, or in the case of a contig: contig_name.M
    #	where the M is a sequence number that genscan will assign to
    #	each gene it discovers.
    mkdir hardChunks
    twoBitToFa mm9Chroms_RandomContigs.hard.2bit stdout \
	| faSplit about stdin 4000000 hardChunks/c_
    ssh kkr1u00
    mkdir /iscratch/i/mus/mm9/hardChunks
    cd /iscratch/i/mus/mm9/hardChunks
    rsync -a --progress /cluster/data/mm9/hardChunks/ .
    for R in 2 3 4 5 6 7 8
do
    rsync -a --progress ./ kkr${R}u00:/iscratch/i/mus/mm9/hardChunks/
done

    ssh hgwdev
    mkdir /cluster/data/mm9/bed/genscan
    cd /cluster/data/mm9/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux

    # Run on small cluster (more mem than big cluster).
    ssh kki
    cd /cluster/data/mm9/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    #	Since we split on gaps, we have no chunks like that.  You can
    #	verify with faCount on the chunks.
    ls -1Sr /iscratch/i/mus/mm9/hardChunks/c_*.fa > genome.list

    ## for next time, this isn't a parasol safe method of operation.
    ## If genscan is writing answers to gtf/ pep/ and subopt/ during
    ##	its operation and it fails, parasol wouldn't be able to verify that
    ##	it was complete merely by a file existence check.  This should work
    ##	in scratch/tmp entirely, then copy results back after it is done.
    # Create template file, for gensub2.  For example (3-line file):
    cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 genome.list single template jobList
    para create jobList
    para try, check, push, check, ...
# Completed: 35 of 36 jobs
# CPU time in finished jobs:     279581s    4659.68m    77.66h    3.24d  0.009 y
# IO & Wait Time:                  3390s      56.50m     0.94h    0.04d  0.000 y
# Average job time:                8085s     134.75m     2.25h    0.09d
# Longest finished job:           32422s     540.37m     9.01h    0.38d
# Submission to last job:        122301s    2038.35m    33.97h    1.42d

    #	There was a failed job, going to kolossus and running it again,
    #	it takes a very long time, and fails with this cryptic error:
    #	No overlap between a and b in mergeTwo
    ssh kolossus
    cd /cluster/data/mm9/bed/genscan
    time /cluster/bin/x86_64/gsBig /iscratch/i/mus/mm9/hardChunks/c_06.fa \
        gtf/c_06.gtf -trans=pep/c_06.pep -subopt=subopt/c_06.bed \
        -exe=hg3rdParty/genscanlinux/genscan \
        -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp \
        -window=2400000
    #	real    922m2.382s
    #	run it with a reduced window size to see if it will complete
    time nice -n +19 /cluster/bin/x86_64/gsBig \
	/iscratch/i/mus/mm9/hardChunks/c_06.fa \
        gtf/c_06.gtf -trans=pep/c_06.pep -subopt=subopt/c_06.bed \
        -exe=hg3rdParty/genscanlinux/genscan \
        -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp \
        -window=2000000
    #	real    648m24.682s
    ## that one failed too, with an error:
# /scratch/tmp/temp_gsBig_10943_chr7_38.genscan is not a GENSCAN output file
    ## and the contents of that file said:
# Insufficient memory error: results may be unreliable.
# Try running program an a portion of sequence.
    #	Let's try splitting up this chr7 on the gaps, which there are plenty
    #	of in this hard masked sequence.  Ended up breaking the chr7 sequence
    #	with the non bridged lift file.  See the lft2BitToFa.pl file in
    #	the chr7_split directory.
    #	on kkstore06
    ssh kkstore06
    mkdir /cluster/data/mm9/bed/genscan/chr7_split
    cd /cluster/data/mm9/bed/genscan/chr7_split
    ./lft2BitToFa.pl ../../../mm9.2bit *.lft > chr7.contigs.hard.fa

    mkdir /cluster/data/mm9/bed/genscan/chr7_run
    cd /cluster/data/mm9/bed/genscan/chr7_run
    mkdir split
    faSplit sequence ../chr7_split/chr7.contigs.hard.fa 100 split/chr7_

    ## Now, on the small kluster
    ssh kki
    cd /cluster/data/mm9/bed/genscan/chr7_run
    mkdir gtf pep subopt

    # Create template file, for gensub2.  For example (3-line file):
    cat << '_EOF_' > template
#LOOP
/cluster/bin/x86_64/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=../hg3rdParty/genscanlinux/genscan -par=../hg3rdParty/genscanlinux/HumanIso.smat -tmp=/scratch/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << happy emacs
    ls -1S split/chr7_*.fa > chr7.list
    gensub2 chr7.list single template jobList
    para create jobList
    para try ... check ... push ... etc...
# Completed: 15 of 15 jobs
# CPU time in finished jobs:       4226s      70.43m     1.17h    0.05d  0.000 y
# IO & Wait Time:                   215s       3.59m     0.06h    0.00d  0.000 y
# Average job time:                 296s       4.93m     0.08h    0.00d
# Longest finished job:             861s      14.35m     0.24h    0.01d
# Submission to last job:           861s      14.35m     0.24h    0.01d
    # lift these chr7 results into a single file,
    #	fixup the gene names with the sed to remove the lift name effect
    ssh kkstore06
    cd /cluster/data/mm9/bed/genscan/chr7_run
    cat gtf/chr7_*.gtf | liftUp -type=.gtf stdout \
	../chr7_split/nonBridgedChr7.lft error stdin \
	| sed -e "s/chr7\.\([0-9][0-9]*\)\./chr7.\1/g" > chr7.gtf
    cat subopt/chr7_*.bed | liftUp -type=.bed stdout \
	../chr7_split/nonBridgedChr7.lft error stdin \
	| sed -e "s/chr7\.\([0-9][0-9]*\)\./chr7.\1/g" > chr7.subopt.bed
    cat pep/chr7_*.pep | sed -e "s/chr7\.\([0-9][0-9]*\)\./chr7.\1/g" > chr7.pep
    ## these results become the c_06 results in the main run
    cp -p chr7.pep ../pep/c_06.pep
    cp -p chr7.subopt.bed ../subopt/c_06.bed
    cp -p chr7.gtf ../gtf/c_06.gtf

    ## after the chr7 business above, back to the mainline processing
    # cat and lift the results into single files
    ssh kkstore06
    cd /cluster/data/mm9/bed/genscan
    cat gtf/c_*.gtf | liftUp -type=.gtf genscan.gtf \
	../../jkStuff/mm9.contigs.lift carry stdin
    cat subopt/c_*.bed | liftUp -type=.bed genscanSubopt.bed \
	../../jkStuff/mm9.contigs.lift carry stdin
    cat pep/c_*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/mm9/bed/genscan
    ldHgGene mm9 -gtf genscan genscan.gtf
    #	Read 45189 transcripts in 324075 lines in 1 files
    #	45189 groups 34 seqs 1 sources 1 feature types
    #	45189 gene predictions

    hgPepPred mm9 generic genscanPep genscan.pep
    hgLoadBed mm9 genscanSubopt genscanSubopt.bed
    #	Loaded 525904 elements of size 6

    #	check the numbers
    time nice -n +19 featureBits mm9 genscan
    #	55293837 bases of 2620346127 (2.110%) in intersection
    time nice -n +19 featureBits mm8 genscan
    #	54455852 bases of 2567283971 (2.121%) in intersection
    time nice -n +19 featureBits mm8 knownGene:cds
    #	28459053 bases of 2567283971 (1.109%) in intersection
    featureBits mm7 genscan
    #	54864694 bases of 2583394090 (2.124%) in intersection
    time nice -n +19 featureBits mm7 knownGene:cds
    #	27531524 bases of 2583394090 (1.066%) in intersection

    featureBits mm9 genscanSubopt
    #	57044145 bases of 2620346127 (2.177%) in intersection
    featureBits mm8 genscanSubopt
    #	57048581 bases of 2567283971 (2.222%) in intersection
    featureBits mm7 genscanSubopt
    #	57512333 bases of 2583394090 (2.226%) in intersection
    featureBits mm6 genscanSubopt
    #	57856316 bases of 2597150411 (2.228%) in intersection
    featureBits mm5 genscanSubopt
    #	58474899 bases of 2615483787 (2.236%) in intersection
    featureBits mm4 genscanSubopt
    #	59601009 bases of 2627444668 (2.268%) in intersection
    featureBits mm3 genscanSubopt
    #	56085184 bases of 2505900260 (2.238%) in intersection

#############################################################################
# BLASTZ SELF (DONE - 2007-08-07 - 2007-08-31 - Hiram)
#	using chain min score of 10,000 to cut down on volume of data
#  trying a two pass sequence, chroms with chroms, then randoms to chroms
#  swap the randoms, then combine the three results into a final set
    ssh kkstore06
    cd /cluster/data/mm9
    time nice -n +19 faToTwoBit ?/chr?.fa ??/chr??.fa mm9.chroms.2bit
    time nice -n +19 faToTwoBit randomContigs/chr*.ctg.fa mm9.randomContigs.2bit

    ssh kkr1u00
    cd /iscratch/i/mus/mm9
    #	local copies of the two 2bit files plus their sizes files
    cp -p /cluster/data/mm9/mm9.chroms.2bit .
    cp -p /cluster/data/mm9/mm9.randomContigs.2bit .
    twoBitInfo mm9.chroms.2bit mm9.chroms.sizes
    twoBitInfo mm9.randomContigs.2bit mm9.randomContigs.sizes
    #	push this directory to the other Iservers
    for R in 2 3 4 5 6 7 8
do
    rsync -a --progress ./ kkr${R}u00:/iscratch/i/mus/mm9/
done

    ssh kkstore06
    mkdir /cluster/data/mm9/bed/blastzSelf.2007-08-07
    cd /cluster/data/mm9/bed/blastzSelf.2007-08-07

    #	DEF file for doBlastzChainNet.pl: the mm9 chroms-only 2bit is used
    #	as both target (SEQ1_*) and query (SEQ2_*)
    cat << '_EOF_' > DEF
# mouse vs mouse
BLASTZ_H=2000
BLASTZ_M=200

# TARGET: Mouse Mm9
SEQ1_DIR=/iscratch/i/mus/mm9/mm9.chroms.2bit
SEQ1_LEN=/cluster/data/mm9/mm9.chroms.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Mouse Mm9
SEQ2_DIR=/iscratch/i/mus/mm9/mm9.chroms.2bit
SEQ2_LEN=/cluster/data/mm9/mm9.chroms.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzSelf.2007-08-07
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    ## run this in a screen on kkstore06
    cd /cluster/data/mm9/bed/blastzSelf.2007-08-07
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \
	-stop=load `pwd`/DEF > blastz.out 2>&1 &
    #	This was a tricky one to complete.  A situation was fixed in the
    #	blastz-run-ucsc script which may have helped, but then there were
    #	32 jobs that would only complete on the kki kluster.  The kk nodes
    #	complained about running out of memory.  After a completed run was
    #	finished, and verified:
    ssh kkstore06
    cd /cluster/data/mm9/bed/blastzSelf.2007-08-07/psl
    find . -type f | wc -l
    #	77284
    wc -l ../run.blastz/jobList
    #	wc -l ../run.blastz.jobList
    #	finished the rest by continuing at the 'cat' step:
    time doBlastzChainNet.pl -verbose=2 \
	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \
	-continue=cat -stop=load `pwd`/DEF > cat.out 2>&1 &
    #	real    285m33.094s
    #  failed during the load because of the SEQ?_LEN specification pointing
    #	to /iscratch/i which is not available on hgwdev.  So, only use
    #	the primary /cluster/data/mm9/chrom.sizes for the DEF file in the future
    #  ran the load step manually to complete with the loadUp.csh fixed.

    ssh kolossus
    cd /cluster/data/mm9/bed/blastzSelf.2007-08-07
    time nice -n +19 featureBits mm9 chainSelfLink \
	> fb.mm9.chainSelfLink.noRandoms.txt 2>&1
    #	real    24m54.883s
    cat fb.mm9.chainSelfLink.noRandoms.txt
    #	323062218 bases of 2620346127 (12.329%) in intersection

    cd /cluster/data/mm9/bed
    ln -s blastzSelf.2007-08-07 blastz.mm9
    ## prepare 2bit file of only the randoms
    ssh kkstore06
    cd /cluster/data/mm9
    faToTwoBit ?/chr?_random.fa ??/chr??_random.fa mm9.randoms.2bit
    # and the sizes files
    twoBitInfo mm9.randomContigs.2bit mm9.randomContigs.sizes
    twoBitInfo mm9.randoms.2bit mm9.randoms.sizes
    # a cluster run for just these bits of sequence
    mkdir /cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsOnly
    cd /cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsOnly
    cat << '_EOF_' > DEF
# mouse vs mouse randoms
PATH=/cluster/bin/penn/x86_64:/cluster/bin/penn:/cluster/bin/scripts:/cluster/bin/x86_64:/bin:/usr/bin

BLASTZ_H=2000
BLASTZ_M=200

# TARGET: Mouse Mm9
SEQ1_DIR=/iscratch/i/mus/mm9/mm9.chroms.2bit
SEQ1_LEN=/cluster/data/mm9/mm9.chroms.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Mouse Mm9 randoms only
SEQ2_DIR=/cluster/data/mm9/mm9.randoms.2bit
SEQ2_LEN=/cluster/data/mm9/mm9.randoms.sizes
SEQ2_CTGDIR=/cluster/data/mm9/mm9.randomContigs.2bit
SEQ2_CTGLEN=/cluster/data/mm9/mm9.randomContigs.sizes
SEQ2_LIFT=/cluster/data/mm9/jkStuff/mm9.contigs.lift
SEQ2_CHUNK=10000000
SEQ2_LIMIT=20
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsOnly
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs
    time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 -ignoreSelf \
	-chainMinScore=10000 -chainLinearGap=medium -bigClusterHub=kk \
	-stop=net `pwd`/DEF > blastz.out 2>&1 &
    #	now swap the primary chroms back to the randoms
    mkdir /cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsSwap
    cd /cluster/data/mm9/bed/blastzSelf.2007-08-07/randomsSwap
    chainSwap ../randomsOnly/axtChain/mm9.mm9.all.chain.gz stdout \
        | nice chainSort stdin stdout | nice gzip -c \
        > mm9.mm9.all.chain.gz

    #	And then combine all three sets together
    mkdir /cluster/data/mm9/bed/blastzSelf.2007-08-07/allTogetherNow
    cd /cluster/data/mm9/bed/blastzSelf.2007-08-07/allTogetherNow
    chainSplit chain ../axtChain/mm9.mm9.all.chain.gz \
        ../randomsOnly/axtChain/mm9.mm9.all.chain.gz \
        ../randomsSwap/mm9.mm9.all.chain.gz
    #	get them sorted by score correctly
    mkdir chainSort
for P in chain/*.chain
do
    F=${P##*/}
    echo $F
    chainSort chain/$F chainSort/$F
done
    #	re-number the chains consistently
    chainMergeSort chainSort/*.chain | nice gzip -c > mm9.mm9.all.chain.gz
    #	neither the unsorted split nor the per-file sorted copies are kept,
    #	the merged file is the only input to the final split below
    rm -fr chain chainSort
    #  and for loading, split this consistently numbered set
    time nice -n +19 chainSplit chain mm9.mm9.all.chain.gz
    #	real    5m0.666s
    ## using a manually fixed up netChains.csh script:
    time nice -n +19 ./netChains.csh > netChains.out 2>&1
    #	real    147m53.147s
    ssh hgwdev
    ## using a manually fixed up loadUp.csh script:
    #		(from ../axtChain/loadUp.csh)
    time nice -n +19 ./loadUp.csh > loadUp.out 2>&1 &
    #	real    99m17.895s
    time nice -n +19 featureBits mm9 chainSelfLink > fb.mm9.chainSelfLink 2>&1
    #	real    30m3.402s
    #	378849408 bases of 2620346127 (14.458%) in intersection
    cat /cluster/data/mm8/bed/blastzSelf.2006-03-20/fb.mm8.chainSelfLink
    #	362483673 bases of 2567283971 (14.119%) in intersection

    # finish off the nets
    time nice -n +19 netClass -verbose=0 -noAr noClass.net mm9 mm9 mm9.mm9.net
    #	real    1m9.538s
    # load nets (not needed for the RR, but useful on genome-test)
    time nice -n +19 netFilter -minGap=10 mm9.mm9.net \
	| hgLoadNet -verbose=0 mm9 netSelf stdin
    #	real    0m40.709s

    ## We don't deliver this track to the RR, so downloads are not necessary

#############################################################################
# PREPARE LINEAGE SPECIFIC REPEAT FILES FOR BLASTZ (DONE - 2007-08-07 - Hiram)
    ssh kkr1u00
    mkdir /iscratch/i/mus/mm9/rmsk
    cd /cluster/data/mm9
    #	collect the per-chrom RepeatMasker .out files in one place
    cp -p */chr*.fa.out /iscratch/i/mus/mm9/rmsk
    cd /iscratch/i/mus/mm9
    #	push to the other Iservers; source and destination must be the
    #	same /iscratch/i/mus/mm9/ directory the rmsk files were copied to
    for R in 2 3 4 5 6 7 8
    do
	rsync -a --progress ./ kkr${R}u00:/iscratch/i/mus/mm9/
    done
    cd rmsk

    ssh kki
    mkdir /cluster/data/mm9/linSpecRep
    cd /cluster/data/mm9/linSpecRep
    ls -1S /iscratch/i/mus/mm9/rmsk > fa.list
    
    #	mkLSR <file name in /iscratch/i/mus/mm9/rmsk>
    #	Runs DateRepeats on the named file from inside the rmsk directory,
    #	with mouse as the query and human, rat, dog and cow as comparison
    #	species.  Any stale result file is removed first.  The result,
    #	<name>_homo-sapiens_rattus_canis-familiaris_bos-taurus, is copied
    #	to the directory the job was started in and then removed from
    #	the rmsk directory.
    cat << '_EOF_' > mkLSR
#!/bin/csh -fe
pushd /iscratch/i/mus/mm9/rmsk
rm -f $1_homo-sapiens_rattus_canis-familiaris_bos-taurus
/cluster/bluearc/RepeatMasker070517/DateRepeats \
    $1 -query mouse -comp human -comp rat -comp dog -comp cow
popd
/bin/cp -p \
  /iscratch/i/mus/mm9/rmsk/$1_homo-sapiens_rattus_canis-familiaris_bos-taurus .
rm -f /iscratch/i/mus/mm9/rmsk/$1_homo-sapiens_rattus_canis-familiaris_bos-taurus
'_EOF_'
    #	<< happy emacs
    chmod +x mkLSR

    cat << '_EOF_' > template
#LOOP
./mkLSR $(path1) {check out line+ $(path1)_homo-sapiens_rattus_canis-familiaris_bos-taurus}
#ENDLOOP
'_EOF_'
    #	<< happy emacs

    gensub2 fa.list single template jobList
    para try ... check ... push ... etc...
    para time
# Completed: 35 of 35 jobs
# CPU time in finished jobs:       1498s      24.96m     0.42h    0.02d  0.000 y
# IO & Wait Time:                   193s       3.22m     0.05h    0.00d  0.000 y
# Average job time:                  48s       0.81m     0.01h    0.00d
# Longest finished job:             102s       1.70m     0.03h    0.00d
# Submission to last job:          3399s      56.65m     0.94h    0.04d

    ssh kkstore06
    cd /cluster/data/mm9/linSpecRep
    #	For each DateRepeats result file, run extractRepeats once per
    #	comparison species.  The numeric argument 1..4 is paired with the
    #	notInHuman, notInRat, notInDog, notInCow output directories, in the
    #	same order as the -comp arguments in mkLSR (presumably it selects
    #	that species' column -- confirm against extractRepeats).
    mkdir notInHuman notInRat notInDog notInCow notInRabbit
    for dateRepeatsOut in chr*.out_homo-sapiens*
    do
	chrom=${dateRepeatsOut/.fa.out*/}
	echo $chrom 
	specIndex=0
	for specDir in notInHuman notInRat notInDog notInCow
	do
	    specIndex=$((specIndex + 1))
	    /cluster/bin/scripts/extractRepeats ${specIndex} ${dateRepeatsOut} \
		> ${specDir}/${chrom}.out.spec
	done
    done

    #	the notInHuman, notInDog, and notInCow ended up being
    #	identical.  Only the notInRat was different than them
    #	To check identical
    #	Print "<checksum> <blocks> <path>" for every .out.spec file, ordered
    #	by the third "/"-separated field (the file name) so that the copies
    #	of one chromosome from the different notIn* directories land on
    #	adjacent lines and can be compared by eye.
    #	A single sort is enough: an earlier "sort -k1,1n" stage would be
    #	completely re-ordered by this (non-stable) sort anyway.
    find . -name "*.out.spec" \
	| while IFS= read -r FN; do echo $(sum -r < "${FN}") "${FN}"; done \
	| sort -t"/" -k3,3
    #	Copy to iscratch for use in kluster runs
    ssh kkr1u00
    mkdir -p /iscratch/i/mus/mm9/linSpecRep/notInRat
    mkdir -p /iscratch/i/mus/mm9/linSpecRep/notInOthers
    cd /iscratch/i/mus/mm9/linSpecRep/notInRat
    cp -p /cluster/data/mm9/linSpecRep/notInRat/* .
    cd /iscratch/i/mus/mm9/linSpecRep/notInOthers
    cp -p /cluster/data/mm9/linSpecRep/notInHuman/* .
    #	copy this directory to the other Iservers
    cd /iscratch/i/mus/mm9
    for R in 2 3 4 5 6 7 8
do
    rsync -a --progress ./ kkr${R}u00:/iscratch/i/mus/mm9/
done

    #	and we can do the Iservers simply:
    ssh kkr1u00
    cd /iscratch/i/mm9
    #	no longer need these two directories
    rm -fr fa rmsk
    rsync -a --progress /cluster/bluearc/scratch/hg/mm9/ .
    for R in 2 3 4 5 6 7 8
    do
	rsync -a --progress /iscratch/i/mm9/ kkr${R}u00:/iscratch/i/mm9/
    done

    # We also need the nibs for blastz runs with lineage specific repeats
    ssh kkstore06
    mkdir /cluster/data/mm9/nib
    cd /cluster/data/mm9
    #	Run faToNib -softMask on every chr*.fa found in the one- and
    #	two-character directories, writing nib/<name>.nib where <name> is
    #	the fasta file name without its directory and .fa extension.
    for FA in ?/chr*.fa ??/chr*.fa
    do
	# a pattern that matched nothing stays literal; do not hand it to faToNib
	[ -e "${FA}" ] || continue
	F=${FA##*/}	# strip the directory part
	F=${F%.fa}	# strip only the trailing .fa extension
	echo faToNib -softMask "${FA}" "nib/${F}.nib"
	faToNib -softMask "${FA}" "nib/${F}.nib"
    done
    #  copied to /cluster/bluearc/scratch/data/mm9/nib/
    #  and everything else we will need for kluster runs into
    #	/cluster/bluearc/scratch/data/mm9/
    # Ask cluster-admin to sync /scratch/ filesystem to kluster nodes

#########################################################################
# BLASTZ RAT Rn4 (DONE - 2007-08-09 - 2007-08-15 - Hiram)
#  re-run a second time with tighter parameters, see below for second run
    ssh kkstore06
    mkdir /cluster/data/mm9/bed/blastzRn4.2007-08-09
    cd /cluster/data/mm9/bed/blastzRn4.2007-08-09
    #	Started this before the rsync to /scratch/data/mm9/ had completed,
    #	hence the /cluster/bluearc/scratch/data/mm9/ location is used
    #	here.

    cat << '_EOF_' > DEF
# mouse vs rat
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm9
SEQ1_DIR=/cluster/bluearc/scratch/data/mm9/nib
SEQ1_SMSK=/cluster/bluearc/scratch/data/mm9/notInRat
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself
SEQ2_DIR=/iscratch/i/rn4/nib
SEQ2_SMSK=/iscratch/i/rn4/linSpecRep.notInMouse
SEQ2_LEN=/cluster/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzRn4.2007-08-09
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF > blastz.out 2>&1 &
    ## had to fix the blast-run-ucsc script to get these to complete.
    # the chr16_random sequence was causing problems because it has no usable
    # sequence in it for blastz to work with.  And finally, two jobs needed to
    # be run manually on kolossus, don't know what happened with them,
    # although their output was immense:
# -rw-rw-r--  1 15054644 Aug 14 10:22 chr2.nib:chr2:80000000-90010000_chr7.nib:chr7:0-10000000.psl
# -rw-rw-r--  1 18992595 Aug 14 11:02 chr2.nib:chr2:80000000-90010000_chr3.nib:chr3:70000000-80000000.psl
    #	I suspect there is something going on with large results and running on
    #	the kk nodes.  I'm getting the same trouble with the self blastz.

    #  then, continuing with the cat
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=cat `pwd`/DEF > cat.out 2>&1 &
    # real    239m51.356s
    cat fb.mm9.chainRn4Link.txt
    #	1791195056 bases of 2620346127 (68.357%) in intersection
    cat /cluster/data/mm8/bed/blastz.rn4/fb.mm8.chainRn4Link
    #	1770319811 bases of 2567283971 (68.957%) in intersection
    cd /cluster/data/mm9/bed
    ln -s blastzRn4.2007-08-09 blastz.rn4

    mkdir /cluster/data/rn4/bed/blastz.mm9.swap
    cd /cluster/data/rn4/bed/blastz.mm9.swap
    time ~/kent/src/hg/utils/automation/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
	-swap /cluster/data/mm9/bed/blastzRn4.2007-08-09/DEF > swap.out 2>&1 &
    #	real    209m11.032s

    cd /cluster/data/rn4/bed
    ln -s blastz.mm9.swap blastz.mm9
    cat /cluster/data/rn4/bed/blastz.mm9/fb.rn4.chainMm9Link.txt
    #	1788261968 bases of 2571531505 (69.541%) in intersection
    cat  /cluster/data/rn4/bed/blastz.mm8/fb.rn4.chainMm8Link.txt
    #	1791093685 bases of 2571531505 (69.651%) in intersection

#########################################################################
## multiple alignment preparation stats
#	The following table will keep track of the pairwise alignments
#	completed.  (Entries shown as % NN.Nxx are not done yet)
#                         featureBits chainLink measures
#                                           chainMm9Link   chain   linearGap
#    distance                       on Mm9      on other   minScore
#  1  0.1587 - rat rn4            (% 68.357)  (% 69.541)   3000     medium
#  2  0.4677 - human hg18         (% 38.499)  (% 35.201)   3000     medium

#  3  0.4686 - chimp panTro2      (% 37.5xx)  (% 33.6xx)   3000     medium
#  4  0.4960 - macaque rheMac2    (% 34.7xx)  (% 33.1xx)   3000     medium
#  5  0.5131 - rabbit oryCun1     (% 19.3xx)  (no swap )   3000     medium
#  6  0.6142 - armadillo dasNov1  (% 16.8xx)  (no swap )   3000     medium
#  7  0.6230 - dog canFam2        (% 32.2xx)  (% 34.2xx)   3000     medium
#  8  0.6256 - elephant loxAfr1   (% 18.3xx)  (no swap )   3000     medium
#  9  0.6344 - cow bosTau2        (% 26.8xx)  (% 24.2xx)   3000     medium
# 10  0.7805 - tenrec echTel1     (% 11.4xx)  (no swap )   5000     loose
# 11  1.0698 - opossum monDom4    (%  8.2xx)  (%  6.0xx)   5000     loose
# 12  1.3425 - chicken galGal2    (%  2.5xx)  (%  5.4xx)   5000     loose
# 13  1.7936 - frog xenTro2       (%  2.6xx)  (%  5.3xx)   5000     loose
# 14  2.0157 - tetraodon tetNig1  (%  1.9xx)  (% 13.7xx)   5000     loose
# 15  2.0562 - fugu fr1           (%  1.9xx)  (% 13.5xx)   5000     loose
# 16  2.1059 - zebrafish danRer5  (%  2.1xx)  (%  3.5xx)   5000     loose

##########################################################################
## BLASTZ SWAP from Hg18 to Mm9 (DONE - 2007-08-15 - Hiram)
    #	also in hg18.txt
    cat /cluster/data/hg18/bed/blastzMm9.2007-08-09/fb.hg18.chainMm9Link.txt
    #	1014323175 bases of 2881515245 (35.201%) in intersection

    #	Then to swap over to Mm9
    ssh kkstore06
    mkdir /cluster/data/mm9/bed/blastz.hg18.swap
    cd /cluster/data/mm9/bed/blastz.hg18.swap
    time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 -swap -bigClusterHub=pk -chainMinScore=3000 \
	-chainLinearGap=medium \
	/cluster/data/hg18/bed/blastz.mm9/DEF > swap.out 2>&1 &
    #	real    67m21.146s
    cat /cluster/data/mm9/bed/blastz.hg18.swap/fb.mm9.chainHg18Link.txt
    #	1008812599 bases of 2620346127 (38.499%) in intersection
    cat /cluster/data/mm8/bed/blastz.hg18/fb.mm8.chainHg18Link
    #	984380268 bases of 2567283971 (38.343%) in intersection

    cd /cluster/data/mm9/bed
    ln -s blastz.hg18.swap blastz.hg18

    ## make swapped syntenic net
    cd /cluster/data/mm9/bed/blastz.hg18.swap
    time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 -bigClusterHub=pk -chainMinScore=3000 \
	-swap -syntenicNet -chainLinearGap=medium -continue=syntenicNet \
	/cluster/data/hg18/bed/blastz.mm9/DEF > syntenic.out 2>&1 &
    ##	real    20m49.712s

#########################################################################
# BLASTZ RAT Rn4 (DONE - 2007-08-30 - Hiram)
#  re-run this second time with tighter parameters
    ssh kkstore06
    mkdir /cluster/data/mm9/bed/blastzRn4.2007-08-30
    cd /cluster/data/mm9/bed/blastzRn4.2007-08-30
    #	The first run (blastzRn4.2007-08-09) was started before the rsync to
    #	/scratch/data/mm9/ had completed and so used the
    #	/cluster/bluearc/scratch/data/mm9/ location; the DEF for this run
    #	uses /scratch/data/mm9/mm9.2bit instead.

    cat << '_EOF_' > DEF
# mouse vs rat
# Specially tuned blastz parameters from Webb Miller

BLASTZ_ABRIDGE_REPEATS=0
BLASTZ_O=600
BLASTZ_E=150
BLASTZ_Y=15000
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/cluster/data/blastz/human_chimp.v2.q

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself
SEQ2_DIR=/scratch/hg/rn4/rn4.2bit
SEQ2_LEN=/cluster/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzRn4.2007-08-30
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
	-stop=net \
	`pwd`/DEF > blastz.out 2>&1 &
    #  this runs much faster than the usual blastz run
    #	failed when it got to the kki run since /scratch/hg/rn4/ was not
    #	complete on the Iservers.  Fixup that, then, continue:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
	-continue=chainMerge -stop=net \
	`pwd`/DEF > chainMerge.out 2>&1 &
    #	And then, kolossus had no /scratch/data/ directory, go there and
    #	make this a symlink to /iscratch/data/
    #	and run the axtChain/netChains.csh script manually on kolossus

#########################################################################
# BLASTZ/CHAIN/NET oryLat1 (DONE - 2007-08-31 - Hiram)
    ssh kkstore04
    mkdir /cluster/data/mm9/bed/blastzOryLat1.2007-08-30
    cd /cluster/data/mm9/bed/blastzOryLat1.2007-08-30

    cat << '_EOF_' > DEF
# mouse vs medaka

BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Medaka oryLat1 (40M chunks covers the largest chroms in one gulp)
#       chrUn in Scaffolds for this alignment run
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=40000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzOryLat1.2007-08-30
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
	-bigClusterHub=kk -verbose=2 > do.log 2>&1 &
    #	real    512m56.909s
    #  had a single failed kk job, finished manually, then:
    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
	-continue=cat -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
    #	real    11m5.508s
    ## typical failure:
    #	HgStepManager: executing step 'net' Fri Aug 31 10:02:51 2007.
    #	netChains: looks like previous stage was not successful (can't find [mm9.oryLat1.]all.chain[.gz]).
    # continuing
    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
	-continue=net -bigClusterHub=kk -verbose=2 > net.log 2>&1 &
    #	real    21m33.501s
    cat fb.mm9.chainOryLat1Link.txt
    #	50650171 bases of 2620346127 (1.933%) in intersection

    # and the swap
    mkdir /cluster/data/oryLat1/bed/blastz.mm9.swap
    cd /cluster/data/oryLat1/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl \
	/cluster/data/mm9/bed/blastzOryLat1.2007-08-30/DEF \
	-chainMinScore=5000 -qRepeats=windowmaskerSdust -chainLinearGap=loose \
	-swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
    cat fb.oryLat1.chainMm9Link.txt
    #	45488232 bases of 700386597 (6.495%) in intersection

#########################################################################
# LOAD ACEMBLY (DONE 9/17/07 angie)
    ssh kkstore06
    mkdir /cluster/data/mm9/bed/acembly
    cd /cluster/data/mm9/bed/acembly
    wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_37_Sep07.mouse.genes/AceView.mm_37.genes_gff.tar.gz
    wget --timestamping ftp://ftp.ncbi.nih.gov/repository/acedb/ncbi_37_Sep07.mouse.genes/AceView.mm_37.good_proteins_fasta.tar.gz
    tar xvzf AceView.mm_37.genes_gff.tar.gz
    tar xvzf AceView.mm_37.good_proteins_fasta.tar.gz

    cd AceView.mm_37.genes_gff
    # If the result of this command is > 0, then some lines have end < start 
    # and need to be fixed:
    awk '$5 < $4 {print;}' *.gff | wc -l
#0

    # Add "chr" prefix:
    sed -e 's/^/chr/;' x1*.gff > acembly.gff

    # Extract annotation types from original gff:
    perl -wpe 's/^.*Gene_type (\w+); transcript_id (\S+);.*/$2\t$1/; \
                 s/Main$/main/ || s/Putative$/putative/ || \
                   die "Unrecognized class:\n$_\n";' *.gff \
    | sort -u \
      > acemblyClass.tab

    # Keep tabs on the transcript names that end in -unspliced --
    # the first time around, had to add that suffix to some protein names
    # in order to get all of them to match.  runJoiner is the real test.
    grep unspliced acemblyClass.tab | wc -l
#54774

    # Pare down proteins to just the ones that we have transcripts for:
    cd /cluster/data/mm9/bed/acembly/AceView.mm_37.good_proteins_fasta
    awk '{print $1;}' ../AceView.mm_37.genes_gff/acemblyClass.tab \
      > transcriptNames.txt
    cat *.fasta \
    | faSomeRecords stdin transcriptNames.txt acemblyPep.fa
    grep unspliced acemblyPep.fa | wc -l
#45033
    # Danielle Thierry-Mieg explained that noncoding genes are included so
    # the number of proteins can be smaller than the number of transcripts.

    # Load tables
    ssh hgwdev
    cd /cluster/data/mm9/bed/acembly/AceView.mm_37.genes_gff
    ldHgGene -gtf mm9 acembly acembly.gff
#Read 173008 transcripts in 2366104 lines in 1 files
#  173008 groups 21 seqs 1 sources 5 feature types
    hgLoadSqlTab mm9 acemblyClass ~/kent/src/hg/lib/acemblyClass.sql \
      acemblyClass.tab
    cd /cluster/data/mm9/bed/acembly/AceView.mm_37.good_proteins_fasta
    hgPepPred mm9 generic acemblyPep acemblyPep.fa
    rm acemblyPep.tab
    runJoiner.csh mm9 acembly
# mm9.acemblyPep.name - hits 149560 of 149560 ok
# mm9.acemblyClass.name - hits 173008 of 173008 ok


#########################################################################
# BLASTZ RAT Rn4 (DONE - 2007-08-30 - 2007-09-11 - Hiram)
#  re-run this third time with a special matrix from Bob Harris/Webb Miller
    cat /cluster/data/blastz/mouse_rat.q
    A    C    G    T
    56 -109  -45 -137
  -109  100 -103  -45
   -45 -103  100 -109
  -137  -45 -109   56
O=600 E=55

    ssh kkstore06
    mkdir /cluster/data/mm9/bed/blastzRn4.2007-08-31
    cd /cluster/data/mm9/bed/blastzRn4.2007-08-31
    #	The first run (blastzRn4.2007-08-09) was started before the rsync to
    #	/scratch/data/mm9/ had completed and so used the
    #	/cluster/bluearc/scratch/data/mm9/ location; the DEF for this run
    #	uses /scratch/data/mm9/mm9.2bit instead.

    cat << '_EOF_' > DEF
# mouse vs rat
# Specially tuned blastz parameters from Webb Miller

BLASTZ_ABRIDGE_REPEATS=0
BLASTZ_O=600
BLASTZ_E=55
BLASTZ_Y=15000
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/cluster/data/blastz/mouse_rat.q

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rat Rn4 - single chunk big enough to run each chrom by itself
SEQ2_DIR=/scratch/hg/rn4/rn4.2bit
SEQ2_LEN=/cluster/data/rn4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzRn4.2007-08-31
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    cd /cluster/data/mm9/bed/blastzRn4.2007-08-31
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
	-stop=net `pwd`/DEF > blastz.out 2>&1 &
    #	real    243m51.078s
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
	-continue=download -stop=download `pwd`/DEF > download.out 2>&1 &
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
	-continue=cleanup -syntenicNet `pwd`/DEF > syntenicNet.out 2>&1 &
    cat fb.mm9.chainRn4Link.txt
    #	1713186474 bases of 2620346127 (65.380%) in intersection

    #	and the swap
    mkdir /cluster/data/rn4/bed/blastz.mm9.swap
    cd /cluster/data/rn4/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/mm9/bed/blastzRn4.2007-08-31/DEF \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=medium \
	-swap -syntenicNet > swap.out 2>&1 &
    #	real    314m59.840s
    cat  fb.rn4.chainMm9Link.txt
    #	1711034941 bases of 2571531505 (66.538%) in intersection

#########################################################################
# EXONIPHY MM9, lifted from hg18 (DONE - 2007-09-05 - Hiram)
#	needed for ucscGenes10 building
    # create a syntenic liftOver chain file
    ssh kolossus
    cd /cluster/data/hg18/bed/blastz.mm9/axtChain
    time nice -n +19 netFilter -syn hg18.mm9.net.gz \
	| netChainSubset -verbose=0 stdin hg18.mm9.all.chain.gz stdout \
	| chainStitchId stdin stdout | gzip -c > hg18.mm9.syn.chain.gz
    #	real    5m55.575s
    #	slightly smaller than the ordinary liftOver chain file:
# -rw-rw-r--  1  77849682 Aug 14 16:49 hg18.mm9.over.chain.gz
# -rw-rw-r--  1  73972671 Sep  5 15:27 hg18.mm9.syn.chain.gz

    # exoniphyMm9.gp is prepared as follows
    ssh hgwdev
    mkdir /cluster/data/mm9/bed/exoniphy
    cd /cluster/data/mm9/bed/exoniphy
    hgsql hg18 -e "select * from exoniphy" -N > exoniphyHg18.gp
    time nice -n +19 liftOver -genePred exoniphyHg18.gp \
	/cluster/data/hg18/bed/blastz.mm9/axtChain/hg18.mm9.syn.chain.gz \
	    exoniphyMm9.gp unmapped
    #	real    52m0.335s
    wc -l *
    #	178162 exoniphyHg18.gp
    #	172859 exoniphyMm9.gp
    #	 10606 unmapped

    ssh hgwdev
    cd /cluster/data/mm9/bed/exoniphy
    nice -n +19 hgLoadGenePred -genePredExt mm9 exoniphy exoniphyMm9.gp
    nice -n +19 featureBits mm9 exoniphy
    #	25931742 bases of 2620346127 (0.990%) in intersection
    nice -n +19 featureBits mm8 exoniphy
    #	25952211 bases of 2567283971 (1.011%) in intersection

#########################################################################
# BLASTZ canFam2 (DONE - 2007-09-04 - Hiram)
    ssh kkstore06
    # establish a screen to control this job
    screen
    mkdir /cluster/data/mm9/bed/blastzCanFam2.2007-09-04
    cd /cluster/data/mm9/bed/blastzCanFam2.2007-09-04

    cat << '_EOF_' > DEF
# mouse vs dog
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_SMSK=/scratch/data/mm9/notInOthers
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Dog CanFam2
SEQ2_DIR=/scratch/hg/canFam2/nib
SEQ2_SMSK=/san/sanvol1/scratch/canFam2/linSpecRep.notInMouse
SEQ2_LEN=/san/sanvol1/scratch/canFam2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzCanFam2.2007-09-04
TMPDIR=/scratch/tmp
'_EOF_'
    #	<< happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	`pwd`/DEF > blastz.out 2>&1 &
    #	real    871m24.249s
    cat fb.mm9.chainCanFam2Link.txt
    #	848004408 bases of 2620346127 (32.362%) in intersection

    mkdir /cluster/data/canFam2/bed/blastz.mm9.swap
    cd /cluster/data/canFam2/bed/blastz.mm9.swap

    time /cluster/bin/scripts/doBlastzChainNet.pl \
	/cluster/data/mm9/bed/blastzCanFam2.2007-09-04/DEF \
	-verbose=2 -bigClusterHub=pk -chainMinScore=3000 \
	-chainLinearGap=medium -swap > swap.out 2>&1 &
    #	real    57m59.126s
    cat fb.canFam2.chainMm9Link.txt
    #	832145360 bases of 2384996543 (34.891%) in intersection

    #	need syntenic net for the multiz
    cd /cluster/data/mm9/bed/blastzCanFam2.2007-09-04
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-syntenicNet -continue=syntenicNet `pwd`/DEF > synNet.out 2>&1 &
    #	real    19m1.302s

#########################################################################
# BLASTZ/CHAIN/NET RHEMAC2 (DONE - 2007-09-05 - Hiram)
    # Won't put this in Conservation -- special request for ancestor recon.
    ssh kkstore06
    #	use a screen to control this job
    # XXX note for next time, naming convention is different here than all the
    # others, and there is a missing TMPDIR in the DEF file
    screen
    mkdir /cluster/data/mm9/bed/blastz.rheMac2.2007-09-05
    cd /cluster/data/mm9/bed/blastz.rheMac2.2007-09-05


    #	Write the DEF file for the mouse (target) vs. macaque rheMac2 (query)
    #	run.  Like the other DEF files in this document, it sets a chunk
    #	size for both sides: SEQ1_CHUNK for the target and SEQ2_CHUNK for
    #	the query.
    cat << '_EOF_' > DEF
# Mouse vs. macacque
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_SMSK=/scratch/data/mm9/notInOthers
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Macacque (rheMac2)
SEQ2_DIR=/san/sanvol1/scratch/rheMac2/nib
SEQ2_SMSK=/cluster/bluearc/rheMac2/linSpecRep/notInRodent
SEQ2_LEN=/cluster/data/rheMac2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastz.rheMac2.2007-09-05
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
      -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
      -syntenicNet `pwd`/DEF > do.log 2>&1 &
    #	real    1017m13.247s
    # some kk kluster difficulties, fixup and complete manually
# Completed: 87616 of 87616 jobs
# CPU time in finished jobs:   26547195s  442453.25m  7374.22h  307.26d  0.842 y
# IO & Wait Time:               3384143s   56402.38m   940.04h   39.17d  0.107 y
# Average job time:                 342s       5.69m     0.09h    0.00d
# Longest finished job:            3159s      52.65m     0.88h    0.04d
# Submission to last job:         65814s    1096.90m    18.28h    0.76d
    #	then, continuing
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
      -bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
      -continue=cat -syntenicNet `pwd`/DEF > cat.log 2>&1 &
    #	real    255m52.382s
    cat fb.mm9.chainRheMac2Link.txt
    #	998017006 bases of 2620346127 (38.087%) in intersection
    mkdir /cluster/data/rheMac2/bed/blastz.mm9.swap
    cd /cluster/data/rheMac2/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/mm9/bed/blastz.rheMac2.2007-09-05/DEF \
	-bigClusterHub=kk -chainMinScore=3000 -chainLinearGap=medium \
	-swap -syntenicNet > swap.log 2>&1 &
    #	real    178m31.911s
    cat fb.rheMac2.chainMm9Link.txt
    #	1094006509 bases of 2646704109 (41.335%) in intersection


#########################################################################
# BLASTZ/CHAIN/NET Orangutan ponAbe1 (DONE - 2007-09-05 - Hiram)
    ssh kkstore01
    #	use a screen to control this job
    screen
    mkdir /cluster/data/mm9/bed/blastzPonAbe1.2007-09-05
    cd /cluster/data/mm9/bed/blastzPonAbe1.2007-09-05

    #	next time, have SEQ2_CHUNK at 30000000 and SEQ2_LIMIT at 100
    #	this caused over 500,000 pk jobs, that is too many
    cat << '_EOF_' > DEF
# mouse vs orangutan
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Orangutan ponAbe1
SEQ2_DIR=/scratch/data/ponAbe1/ponAbe1.2bit
SEQ2_LEN=/cluster/data/ponAbe1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzPonAbe1.2007-09-05
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-stop=load -chainMinScore=3000 \
	-chainLinearGap=medium -bigClusterHub=pk > do.log 2>&1 &
    #	real    897m58.156s
    #	some pk kluster difficulties, fixup and complete manually
# Completed: 511290 of 511290 jobs
# CPU time in finished jobs:   11448015s  190800.24m  3180.00h  132.50d  0.363 y
# IO & Wait Time:               1852197s   30869.96m   514.50h   21.44d  0.059 y
# Average job time:                  26s       0.43m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             349s       5.82m     0.10h    0.00d
# Submission to last job:         54771s     912.85m    15.21h    0.63d
    #	then, continuing
    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-continue=cat -stop=load -chainMinScore=3000 \
	-chainLinearGap=medium -bigClusterHub=pk > cat.log 2>&1 &
    #	ran into trouble on the kki chain run with stuff missing
    #	from the Iservers /scratch/data/ - rsync them up and get
    #	the run done manually
# Completed: 24 of 24 jobs
# CPU time in finished jobs:      17718s     295.30m     4.92h    0.21d  0.001 y
# IO & Wait Time:                   203s       3.38m     0.06h    0.00d  0.000 y
# Average job time:                 747s      12.45m     0.21h    0.01d
# Longest finished job:            3673s      61.22m     1.02h    0.04d
# Submission to last job:          3886s      64.77m     1.08h    0.04d
    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-continue=chainMerge -stop=load -chainMinScore=3000 \
	-chainLinearGap=medium -bigClusterHub=pk > chainMerge.log 2>&1 &
    #	real    55m27.522s
    cat fb.mm9.chainPonAbe1Link.txt
    #	913843325 bases of 2620346127 (34.875%) in intersection

    mkdir /cluster/data/ponAbe1/bed/blastz.mm9.swap
    cd /cluster/data/ponAbe1/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/mm9/bed/blastzPonAbe1.2007-09-05/DEF \
	-stop=load -chainMinScore=3000 \
	-swap -chainLinearGap=medium -bigClusterHub=pk > swap.log 2>&1 &

    # create the syntenic maf nets:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-continue=download -syntenicNet -chainMinScore=3000 \
	-chainLinearGap=medium -bigClusterHub=pk > syntenicNet.log 2>&1 &
    #	real 20m55.024s

    #	create reciprocal best chains/nets
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzPonAbe1.2007-09-05
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 ponAbe1 \
	> rbest.log 2>&1 &
    #	real    53m43.377s

#########################################################################
# BLASTZ/CHAIN/NET Marmoset calJac1 (DONE - 2007-09-06 - 2007-09-07 - Hiram)
    ssh kkstore06
    #	use a screen to control this job
    screen
    mkdir /cluster/data/mm9/bed/blastzCalJac1.2007-09-06
    cd /cluster/data/mm9/bed/blastzCalJac1.2007-09-06

    #	next time, try SEQ2_CHUNK at 40000000, SEQ2_LIMIT at 75
    #	this created 285,570 kluster jobs, that is too many
    cat << '_EOF_' > DEF
# mouse vs marmoset
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Marmoset calJac1
SEQ2_DIR=/scratch/data/calJac1/calJac1.2bit
SEQ2_LEN=/cluster/data/calJac1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzCalJac1.2007-09-06
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-stop=load -chainMinScore=3000 \
	-chainLinearGap=medium -bigClusterHub=pk > do.log 2>&1 &
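    #	real    897m58.156s
    #	some pk kluster difficulties, fixup and complete manually
    #	NOTE: the elapsed time above and the job statistics below are
    #	identical to those recorded for the ponAbe1 run and do not match
    #	the 285,570 jobs noted for this run; they look like a copy-paste
    #	left over from the ponAbe1 section.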
# Completed: 511290 of 511290 jobs
# CPU time in finished jobs:   11448015s  190800.24m  3180.00h  132.50d  0.363 y
# IO & Wait Time:               1852197s   30869.96m   514.50h   21.44d  0.059 y
# Average job time:                  26s       0.43m     0.01h    0.00d
# Longest finished job:             349s       5.82m     0.10h    0.00d
# Submission to last job:         54771s     912.85m    15.21h    0.63d
    #	then, continuing
    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-continue=cat -stop=load -chainMinScore=3000 \
	-chainLinearGap=medium -bigClusterHub=pk > cat.log 2>&1 &
    #	real    669m34.473s
    cat fb.mm9.chainCalJac1Link.txt
    #	863961573 bases of 2620346127 (32.971%) in intersection

    mkdir /cluster/data/calJac1/bed/blastz.mm9.swap
    cd /cluster/data/calJac1/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/mm9/bed/blastzCalJac1.2007-09-06/DEF \
	-stop=load -chainMinScore=3000 \
	-swap -chainLinearGap=medium -bigClusterHub=pk > swap.log 2>&1 &
    #	real    217m10.835s
    cat fb.calJac1.chainMm9Link.txt
    #	887586922 bases of 2929139385 (30.302%) in intersection
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 /cluster/data/mm9/bed/blastzCalJac1.2007-09-06/DEF \
	-continue=download -chainMinScore=3000 \
	-swap -chainLinearGap=medium -bigClusterHub=pk > download.log 2>&1 &
    #	real    1m9.876s

    #	run the syntenic nets
    time nice -n +19 doBlastzChainNet.pl -verbose=2 DEF \
	-continue=download -chainMinScore=3000 \
	-syntenicNet -chainLinearGap=medium -bigClusterHub=pk \
	> syntenicNet.log 2>&1 &
    #	real 22m51.080s

    #	create reciprocal best chains/nets
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzCalJac1.2007-09-06
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 calJac1 \
	> rbest.log 2>&1 &
    #	real    47m18.467s

#########################################################################
# BLASTZ/CHAIN/NET Fugu fr2 (DONE - 2007-09-06 - 2007-09-07 - Hiram)
    ssh kkstore02
    #	use a screen to control this job
    screen
    mkdir /cluster/data/mm9/bed/blastzFr2.2007-09-06
    cd /cluster/data/mm9/bed/blastzFr2.2007-09-06

    #	Write the DEF file for the mouse (target) vs. Fugu fr2 (query) run.
    #	The query is aligned as scaffolds (SEQ2_CTGDIR/SEQ2_CTGLEN) with
    #	SEQ2_LIFT naming the lift file, as described in the QUERY comment.
    cat << '_EOF_' > DEF
# mouse vs fugu

BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Fugu fr2
#       Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates
SEQ2_DIR=/san/sanvol1/scratch/fr2/fr2.2bit
SEQ2_LEN=/san/sanvol1/scratch/fr2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/fr2/fr2.scaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/fr2/fr2.scaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/fr2/liftAll.lft
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzFr2.2007-09-06
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
	-bigClusterHub=kk -verbose=2 > do.log 2>&1 &
    #	real    156m55.151s
    # fixup broken kluster jobs, complete manually
# Completed: 70395 of 70395 jobs
# CPU time in finished jobs:    4339015s   72316.91m  1205.28h   50.22d  0.138 y
# IO & Wait Time:                486414s    8106.90m   135.12h    5.63d  0.015 y
# Average job time:                  69s       1.14m     0.02h    0.00d
# Longest finished job:            1098s      18.30m     0.30h    0.01d
# Submission to last job:         18352s     305.87m     5.10h    0.21d
    # and then continuing
    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
	-continue=cat -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
    #	real    5m43.977s

    #	Still, the typical failure
# HgStepManager: executing step 'net' Thu Sep  6 16:04:56 2007.
# netChains: looks like previous stage was not successful (can't find [mm9.fr2.]all.chain[.gz]).
    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
	-continue=net -bigClusterHub=kk -verbose=2 > net.log 2>&1 &
    #	real    178m15.798s
     cat fb.mm9.chainFr2Link.txt
    #	47018710 bases of 2620346127 (1.794%) in intersection

    mkdir /cluster/data/fr2/bed/blastz.mm9.swap
    cd /cluster/data/fr2/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \
	/cluster/data/mm9/bed/blastzFr2.2007-09-06/DEF \
	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
	-swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
    #	real    15m32.368s
    cat fb.fr2.chainMm9Link.txt
    #	42413565 bases of 393312790 (10.784%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET Tetraodon tetNig1 (DONE - 2007-09-06 - 2007-09-07 - Hiram)
    ssh kkstore01
    #	use a screen to control this job
    screen
    mkdir /cluster/data/mm9/bed/blastzTetNig1.2007-09-06
    cd /cluster/data/mm9/bed/blastzTetNig1.2007-09-06

    cat << '_EOF_' > DEF
# mouse vs tetraodon

BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Tetraodon tetNig1
#       Align to the scaffolds, results lifted up to chrUn.sdTrf coordinates
SEQ2_DIR=/san/sanvol1/scratch/tetNig1/tetNig1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/tetNig1/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/tetNig1/tetNig1.randomContigs.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzTetNig1.2007-09-06
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
	-bigClusterHub=kk -verbose=2 > do.log 2>&1 &
    #	real    535m2.474s
    #	Typical failure
# HgStepManager: executing step 'net' Fri Sep  7 01:13:06 2007.
# netChains: looks like previous stage was not successful (can't find [mm9.tetNig1.]all.chain[.gz]).
    # continuing
    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
	-continue=net -bigClusterHub=kk -verbose=2 > net.log 2>&1 &
    cat fb.mm9.chainTetNig1Link.txt
    #	46206292 bases of 2620346127 (1.763%) in intersection

    mkdir /cluster/data/tetNig1/bed/blastz.mm9.swap
    cd /cluster/data/tetNig1/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/mm9/bed/blastzTetNig1.2007-09-06/DEF \
	-chainMinScore=5000 \
	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
	-swap -bigClusterHub=kk > swap.log 2>&1 &
    #	real    19m58.885s
    cat fb.tetNig1.chainMm9Link.txt
    #	42256263 bases of 342403326 (12.341%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET Stickleback gasAcu1 (DONE - 2007-09-06 - 2007-09-07 - Hiram)
    ssh kkstore01
    #	use a screen to control this job
    screen
    mkdir /cluster/data/mm9/bed/blastzGasAcu1.2007-09-06
    cd /cluster/data/mm9/bed/blastzGasAcu1.2007-09-06

    cat << '_EOF_' > DEF
# mouse vs stickleback

BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: stickleback gasAcu1
SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift
SEQ2_CHUNK=35000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzGasAcu1.2007-09-06
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
	-bigClusterHub=kk -verbose=2 > do.log 2>&1 &
# Completed: 52725 of 52725 jobs
# CPU time in finished jobs:    4110432s   68507.19m  1141.79h   47.57d  0.130 y
# IO & Wait Time:                413069s    6884.49m   114.74h    4.78d  0.013 y
# Average job time:                  86s       1.43m     0.02h    0.00d
# Longest finished job:            1140s      19.00m     0.32h    0.01d
# Submission to last job:         71194s    1186.57m    19.78h    0.82d
    #	had some jobs fail on the kk run, finish manually, then continuing:
    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
	-continue=cat -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
    #	real    120m36.209s
    # failed kki chain job due to san outage on kkr7u00, finished manually:
# Completed: 24 of 24 jobs
# CPU time in finished jobs:       1807s      30.12m     0.50h    0.02d  0.000 y
# IO & Wait Time:                   258s       4.29m     0.07h    0.00d  0.000 y
# Average job time:                  86s       1.43m     0.02h    0.00d
# Longest finished job:             257s       4.28m     0.07h    0.00d
# Submission to last job:          9851s     164.18m     2.74h    0.11d
    #	continuing
    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 -verbose=2 \
	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
	-continue=chainMerge -bigClusterHub=kk > chainMerge.log 2>&1 &
    #	real    21m7.089s
    cat fb.mm9.chainGasAcu1Link.txt
    #	48448585 bases of 2620346127 (1.849%) in intersection

    mkdir /cluster/data/gasAcu1/bed/blastz.mm9.swap
    cd /cluster/data/gasAcu1/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 \
	/cluster/data/mm9/bed/blastzGasAcu1.2007-09-06/DEF \
	-qRepeats=windowmaskerSdust -chainLinearGap=loose \
	-swap -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
    cat fb.gasAcu1.chainMm9Link.txt
    #	43730193 bases of 446627861 (9.791%) in intersection


#########################################################################
# BLASTZ Zebrafish danRer5 (DONE - 2007-09-11 - 2007-09-12 - Hiram)
#	re-run a second time with BLASTZ_Q, see below
    ssh kkstore06
    screen	# use screen to manage this job
    mkdir /cluster/data/mm9/bed/blastzDanRer5.2007-09-11
    cd /cluster/data/mm9/bed/blastzDanRer5.2007-09-11

    cat << '_EOF_' > DEF
# Mouse (mm9) vs zebrafish (danRer5)
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - zebrafish (danRer5)
SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit
SEQ2_LEN=/cluster/data/danRer5/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzDanRer5.2007-09-11
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 &
    #	real    222m47.787s
    cat fb.mm9.chainDanRer5Link.txt
    #	48497464 bases of 2620346127 (1.851%) in intersection

    mkdir /cluster/data/danRer5/bed/blastz.mm9.swap
    cd /cluster/data/danRer5/bed/blastz.mm9.swap
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-chainMinScore=5000 \
	/cluster/data/mm9/bed/blastzDanRer5.2007-09-11/DEF \
	-swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
	> swap.log 2>&1 &
    #	real    9m47.163s
    cat fb.danRer5.chainMm9Link.txt
    #	34017483 bases of 1435609608 (2.370%) in intersection

#########################################################################
# BLASTZ Zebrafish danRer5 (DONE - 2007-09-13 - Hiram)
#	second time, forgot to include BLASTZ_Q the first time
    ssh kkstore06
    screen	# use screen to manage this job
    mkdir /cluster/data/mm9/bed/blastzDanRer5.2007-09-13
    cd /cluster/data/mm9/bed/blastzDanRer5.2007-09-13

    #	This is the wrong way overlap, but it seems to work
    cat << '_EOF_' > DEF
# Mouse (mm9) vs zebrafish (danRer5)
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - zebrafish (danRer5)
SEQ2_DIR=/scratch/data/danRer5/danRer5.2bit
SEQ2_LEN=/cluster/data/danRer5/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzDanRer5.2007-09-13
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 &
    #	real    369m16.947s
    cat fb.mm9.chainDanRer5Link.txt
    #	84513268 bases of 2620346127 (3.225%) in intersection

    mkdir /cluster/data/danRer5/bed/blastz.mm9.swap
    cd /cluster/data/danRer5/bed/blastz.mm9.swap
    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-chainMinScore=5000 \
	/cluster/data/mm9/bed/blastzDanRer5.2007-09-13/DEF \
	-swap -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
	> swap.log 2>&1 &
    #	real    21m44.784s
    cat fb.danRer5.chainMm9Link.txt
    #	66400782 bases of 1435609608 (4.625%) in intersection

#########################################################################
# BLASTZ/CHAIN/NET Guinea Pig cavPor2 (DONE - 2007-09-19 - kate)
    ssh kkstore06
    mkdir /cluster/data/mm9/bed/blastzCavPor2.2007-09-19
    cd /cluster/data/mm9/bed/blastzCavPor2.2007-09-19

    cat << '_EOF_' > DEF
# mouse vs guinea pig
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Guinea pig cavPor2
SEQ2_DIR=/san/sanvol1/scratch/cavPor2/cavPor2.2bit
SEQ2_LEN=/san/sanvol1/scratch/cavPor2/chrom.sizes

# chunking similar to cat (similar number of scaffolds)
SEQ2_CHUNK=30000000
SEQ2_LIMIT=500
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzCavPor2.2007-09-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    doBlastzChainNet.pl `pwd`/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
        -bigClusterHub=pk >& do.log  &

    # load nets manually -- automated loading fails as classification info 
    #  not available (no database)
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastz.cavPor2/axtChain
    netFilter -minGap=10 noClass.net | hgLoadNet -warn mm9 netCavPor2 stdin
    netFilter -minGap=10 mm9.cavPor2.rbest.net.gz |  \
        hgLoadNet -warn mm9 netRBestCavPor2 stdin

    doBlastzChainNet.pl `pwd`/DEF \
	-chainMinScore=3000 -chainLinearGap=medium \
        -continue=download >& do2.log &

    # reciprocal best net mafs for multiz
    ~/kent/src/hg/utils/automation/doRecipBest.pl mm9 cavPor2 >&! rbest.log &

    time nice -n +19 featureBits mm9 chainCavPor2Link \
	> fb.mm9.chainCavPor2Link.txt 2>&1
    cat fb.mm9.chainCavPor2Link.txt
    #	480194223 bases of 2620346127 (18.326%) in intersection

    #	create the syntenic maf nets (these are unneeded):
    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -chainMinScore=3000 \
	-chainLinearGap=medium -continue=syntenicNet -syntenicNet \
	-bigClusterHub=pk > syntenicNet.log 2>&1

#########################################################################
## 4-Way Multiz (DONE - 2007-09-07 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/mm9/bed/multiz4way
    cd /cluster/data/mm9/bed/multiz4way

    ln -s ../multiz30way/mm9.guess.30way.nh ./30way.nh

    # leave mm9, rn4, canFam2 and hg18 (prune every other species):
    /cluster/bin/phast/tree_doctor \
	--prune panTro2,ponAbe1,rheMac2,calJac1,otoGar1,tupBel1,cavPor2,oryCun1,sorAra1,eriEur1,felCat3,equCab1,bosTau3,dasNov1,loxAfr1,echTel1,monDom4,ornAna1,galGal3,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat1,danRer4  30way.nh

    # this leaves us with:
    
    cat << '_EOF_' > 4way.nh
((hg18:0.126901,
	(rn4:0.084383,mm9:0.076274):0.249544):0.019763,canFam2:0.187963);
'_EOF_'
    # << happy emacs

    #	Use this specification in the phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    #	to obtain a gif image for htdocs/images/phylo/mm9_4way.gif

    /cluster/bin/phast/all_dists 4way.nh > 4way.distances.txt
    #	Use this output to create the table below
    grep -y mm9 4way.distances.txt | sort -k3,3n
#
#	If you can fill in all the numbers in this table, you are ready for
#	the multiple alignment procedure
#
#                         featureBits chainLink measures
#                                        chainOryLat1Link   chain    linearGap
#    distance                      on mm9    on other   minScore
#  1  0.160657 - rat rn4       (% 65.380) (% xx.xxx)       5000     medium
#  2  0.452719 - human hg18    (% 38.499) (% 35.201)       3000     medium
#  3  0.533544 - dog canFam2   (% 32.362) (% 34.891)       3000     medium

    #	using the syntenic nets
    cd /cluster/data/mm9/bed/multiz4way
    mkdir mafLinks
    mkdir mafLinks/rn4
    cd mafLinks/rn4
    ln -s ../../../blastzRn4.2007-08-31/mafSynNet/*.maf.gz .
    mkdir ../hg18
    cd ../hg18
    ln -s ../../../blastz.hg18/mafSynNet/*.maf.gz .
    mkdir ../canFam2
    cd ../canFam2
    ln -s ../../../blastz.canFam2/mafSynNet/*.maf.gz .

    #	Copy MAFs to some appropriate NFS server for kluster run
    mkdir /san/sanvol1/scratch/mm9/multiz4way
    cd /san/sanvol1/scratch/mm9/multiz4way
    time nice -n +19 rsync -a --copy-links --progress \
	/cluster/data/mm9/bed/multiz4way/mafLinks/ .
    #	1 minute to copy 2.4 Gb

    #	determine what is the newest version of multiz and use that
    mkdir penn
    cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/multiz penn
    cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/maf_project penn
    cp -p /cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba/autoMZ penn

    # the autoMultiz cluster run
    ssh pk
    cd /cluster/data/mm9/bed/multiz4way

    # create species list and stripped down tree for autoMZ
    #	Step 1: from 4way.nh remove any lower-case "name_" prefixes, the
    #	":<number>" branch lengths and the ";", and delete lines that are
    #	empty or contain only spaces.
    sed 's/[a-z][a-z]*_//g; s/:[0-9\.][0-9\.]*//g; s/;//; /^ *$/d' \
	4way.nh > tmp.nh
    #	Step 2: the unquoted command substitution lets the shell word-split
    #	the file, so the multi-line tree is written back as a single line.
    echo `cat tmp.nh` > tree-commas.nh
    #	Step 3: remove all spaces, then turn each comma into a space.
    #	tree.nh is the tree string the autoMultiz job script passes to autoMZ.
    echo `cat tree-commas.nh` | sed 's/ //g; s/,/ /g' > tree.nh
    #	Step 4: strip the parentheses, leaving a space separated list of
    #	species names; the autoMultiz job script loops over this list.
    sed 's/[()]//g; s/,/ /g' tree.nh > species.lst

    mkdir run maf
    cd run

    #	NOTE: you need to set the db and multiz dirname properly in this script
    #	Write the csh job script autoMultiz (one invocation per chromosome):
    #	  $1 - chromosome name; selects $pairs/<species>/$1.maf or .maf.gz
    #	  $2 - path the finished multiple alignment maf is copied to
    #	The script works in /scratch/tmp/$db/multiz.$1, which it removes both
    #	before starting and after copying out the result.  For every name in
    #	species.lst other than $db it uncompresses or copies the pairwise maf
    #	to $db.<species>.sing.maf, or writes a header-only maf when neither
    #	input file exists.  It then puts $binDir first on the path and runs
    #	autoMZ with the tree read from tree.nh.
    #	The csh -e flag on the #! line means the script stops at the first
    #	failing command.
    cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = mm9
set c = $1
set maf = $2
set binDir = /san/sanvol1/scratch/$db/multiz4way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz4way
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
    set in = $pairs/$s/$c.maf
    set out = $db.$s.sing.maf
    if ($s == $db) then
	continue
    endif
    if (-e $in.gz) then
	zcat $in.gz > $out
    else if (-e $in) then
	cp $in $out
    else
	echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
    # << happy emacs
    #	make the generated job script executable so ./autoMultiz can be run
    chmod +x autoMultiz

cat  << '_EOF_' > template
#LOOP
./autoMultiz $(root1) {check out line+ /cluster/data/mm9/bed/multiz4way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    awk '{print $1}' /cluster/data/mm9/chrom.sizes > chrom.lst
    gensub2 chrom.lst single template jobList
    para create jobList
    # 35 jobs
    para try ... check ... push ... etc ...
# Completed: 35 of 35 jobs
# CPU time in finished jobs:      27901s     465.02m     7.75h    0.32d  0.001 y
# IO & Wait Time:                   562s       9.37m     0.16h    0.01d  0.000 y
# Average job time:                 813s      13.55m     0.23h    0.01d
# Longest finished job:            2222s      37.03m     0.62h    0.03d
# Submission to last job:          2222s      37.03m     0.62h    0.03d

    #	combine results into a single file for loading and gbdb reference
    ssh kkstore06
    cd /cluster/data/mm9/bed/multiz4way
    time nice -n +19 catDir maf > multiz4way.maf
    #	real    2m43.409s

    #	makes a 6.5 Gb file:
    #	-rw-rw-r--  1 6883356263 Sep  7 11:00 multiz4way.maf

    #	Create per-chrom individual maf files for downloads
    #	NOT NECESSARY HERE - DONE LATER WITH THE ANNOTATED MAFS
    ssh kkstore04
    cd /cluster/data/mm9/bed/multiz4way
    mkdir mafDownloads
    time for M in maf/chr*.maf
    do
	B=`basename $M`
	cp -p ${M} mafDownloads/${B}
	gzip mafDownloads/${B}
	echo ${B} done
    done
    #	real    5m9.273

    #	deliver to downloads *!* NOT NECESSARY HERE - DONE LATER WITH
    #		THE ANNOTATED MAFS
    ssh hgwdev
    ln -s /cluster/data/mm9/bed/multiz4way/mafDownloads \
	/usr/local/apache/htdocs/goldenPath/mm9/multiz4way

    # Load into database
    ssh hgwdev
    cd /cluster/data/mm9/bed/multiz4way
    mkdir /gbdb/mm9/multiz4way
    ln -s /cluster/data/mm9/bed/multiz4way/multiz4way.maf \
	/gbdb/mm9/multiz4way
    time nice -n +19 hgLoadMaf mm9 multiz4way
    #	Loaded 5072051 mafs in 1 files from /gbdb/mm9/multiz4way
    #	real    2m33.680s

    time nice -n +19 hgLoadMafSummary -minSize=10000 -mergeGap=500 \
	-maxSize=50000 mm9 multiz4waySummary multiz4way.maf
    #	Created 1330454 summary blocks from 9893113 components
    #	and 5068764 mafs from multiz4way.maf
    #	real    3m27.620s

    #	Create tree image for details page
    #	You can get a better image from the phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    # with mm9 on top:
(((mouse_mm9:0.076274,rat_rn4:0.084383):0.249544,human_hg18:0.126901):0.019763,
dog_canFam2:0.187963);

#########################################################################
### GNF ATLAS 2 - required for UCSC Gene/Gene Sorter build
#	(DONE - 2007-09-10 - Hiram)
    # Align probes from GNF1M chip.
    ssh pk
    mkdir -p /cluster/data/mm9/bed/geneAtlas2/run/psl
    cd /cluster/data/mm9/bed/geneAtlas2/run

    cut -f1 /cluster/data/mm9/chrom.sizes > genome.list

    ls -1 /cluster/bluearc/geneAtlas2/gnf1m.fa > probe.list

    cat << '_EOF_' > template
#LOOP
blat -fine -ooc=/scratch/data/mm9/11.ooc /scratch/data/mm9/nib/$(path1).nib $(path2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 genome.list probe.list template jobList
    para create jobList
    para try ... check ... push ... etc.
    para time
# Completed: 35 of 35 jobs
# CPU time in finished jobs:      14865s     247.75m     4.13h    0.17d  0.000 y
# IO & Wait Time:                   160s       2.66m     0.04h    0.00d  0.000 y
# Average job time:                 429s       7.15m     0.12h    0.00d
# Longest finished job:            1151s      19.18m     0.32h    0.01d
# Submission to last job:          1166s      19.43m     0.32h    0.01d

    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyGnf1m.psl.
    pslSort dirs raw.psl tmp psl
    pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl \
	../affyGnf1m.psl /dev/null

    # Load probes and alignments from GNF1M into database.
    ssh hgwdev
    cd /cluster/data/mm9/bed/geneAtlas2
#    ln -s /projects/compbio/data/microarray/geneAtlas2/mouse/gnf1m.fa /gbdb/hgFixed/affyProbes
    hgLoadPsl mm9 affyGnf1m.psl
    hgLoadSeq mm9 /gbdb/hgFixed/affyProbes/gnf1m.fa
    #	31309 sequences

    # Load up track
    hgMapMicroarray gnfAtlas2.bed hgFixed.gnfMouseAtlas2MedianRatio \
    	affyGnf1m.psl
    #Loaded 34863 rows of expression data from hgFixed.gnfMouseAtlas2MedianRatio
    #	Mapped 30117,  multiply-mapped 1723, missed 882, unmapped 4746

    # Note that the unmapped 5000 records are from all-N sequences.
    hgLoadBed mm9 gnfAtlas2 gnfAtlas2.bed
    #	Loaded 31840 elements of size 15
    featureBits mm9 gnfAtlas2
    #	12921627 bases of 2620346127 (0.493%) in intersection
    featureBits mm8 gnfAtlas2
    #	12858280 bases of 2567283971 (0.501%) in intersection

    #	during the build of UCSC genes, this sequence takes place:
    hgMapToGene mm9 affyGnf1m knownGene knownToGnf1m
    hgExpDistance mm9 hgFixed.gnfMouseAtlas2MedianRatio \
	hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m
    #	this hgExpDistance command takes some time, maybe an hour or so ?
    #	Have 34863 elements in hgFixed.gnfMouseAtlas2MedianRatio
    #	Got 31145 unique elements in hgFixed.gnfMouseAtlas2MedianRatio
    hgMapToGene mm9 gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'

############################################################################
### affyU74 TRACK - needed for the Gene Sorter (DONE - 2007-09-10 - Hiram)
#                              
# MAKE THE affyU74 TRACK using Affy consensus sequences instead of 
# target sequences. Recalculate alignments and load data
#
#	The affy data has previously been loaded to iscratch in:
#	/iscratch/i/affy
# It originates from:
# /projects/compbio/data/microarray/affyGnfMouse/sequences/

    # Run cluster job to do alignments
    ssh kk
    mkdir -p /cluster/data/mm9/bed/affyU74/run/psl
    cd /cluster/data/mm9/bed/affyU74/run
    cut -f1 /cluster/data/mm9/chrom.sizes > genome.list
    ls -1 /iscratch/i/affy/U74*consensus.fa > affy.list
    cat << '_EOF_' > template
#LOOP
blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/data/mm9/11.ooc /scratch/data/mm9/nib/$(path1).nib {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs

    gensub2 genome.list affy.list template jobList
    para create jobList
    para try ... check ... push ... etc.
    para time
# Completed: 105 of 105 jobs
# CPU time in finished jobs:       5891s      98.18m     1.64h    0.07d  0.000 y
# IO & Wait Time:                   738s      12.31m     0.21h    0.01d  0.000 y
# Average job time:                  63s       1.05m     0.02h    0.00d
# Longest finished job:             199s       3.32m     0.06h    0.00d
# Submission to last job:           215s       3.58m     0.06h    0.00d

# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU74.psl.
    pslSort dirs raw.psl tmp psl

# change filter parameters for these sequences. only use alignments that
# cover 30% of sequence and have at least minAli = 0.95.
# minAli = 0.97 too high. low minCover as a lot of n's in these sequences
#pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
    pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl \
	../all_affyU74.psl /dev/null
    #	Processed 40512 alignments

# Sort by chromosome and load into database.
    ssh hgwdev
    cd /cluster/data/mm9/bed/affyU74
    pslSortAcc nohead chrom temp all_affyU74.psl
    #	Processed 30609 lines into 1 temp files
    cat chrom/*.psl > affyU74.psl

# shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
# and reload data into table

    #	keep the unmodified alignments under a .orig name; the rebuilt file
    #	below takes over the affyU74.psl name
    mv affyU74.psl affyU74.psl.orig

    #	Split the 21 psl columns into three pieces so that only column 10
    #	(the qName being shortened) is edited: its first ":" is turned into
    #	a tab and only the text after it is kept, dropping the chip prefix.
    #	NOTE(review): the \t in the sed replacement relies on GNU sed - confirm
    #	NOTE(review): this sed does not remove a trailing ";" - presumably the
    #	names no longer carry one at this point; verify
    cut -f 1-9 affyU74.psl.orig >j1.tmp
    cut -f 10 affyU74.psl.orig | sed -e 's/:/\t/' | cut -f 2 > j2.tmp
    cut -f 11-21 affyU74.psl.orig >j3.tmp
    #	glue the three pieces back together in their original column order
    paste j1.tmp j2.tmp j3.tmp >affyU74.psl

    hgLoadPsl mm9 affyU74.psl
    #	remove the intermediate files and directories, including the
    #	cluster run directory
    rm -rf chrom temp run j?.tmp

    #	creating the gene sorter tables runs the following:
    hgMapToGene mm9 affyU74  knownGene knownToU74

############################################################################
##   MAKE THE affyGnfU74 TRACKs (DONE 3/8/06, Fan)
# Make bed files and load consensus sequences for Affy U74 chip set.

    #This needs to be done after affyU74 is already made.
    ssh hgwdev
    mkdir -p /cluster/data/mm9/bed/affyGnf
    cd /cluster/data/mm9/bed/affyGnf
#	may need to build this command in src/hg/affyGnf
~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74/affyU74.psl \
	/projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 \
	affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2
# 89 experiments
# 10043 rows of expression data
# 30609 records in ../affyU74/affyU74.psl
# 10309 records written to affyGnfU74A.bed

~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74/affyU74.psl \
	/projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt \
	affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2
# 20 experiments
# 12477 rows of expression data
# 30609 records in ../affyU74/affyU74.psl
# 11324 records written to affyGnfU74B.bed

~/src/hg/affyGnf/affyPslAndAtlasToBed ../affyU74/affyU74.psl \
	/projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt \
	affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2
# 20 experiments
# 11934 rows of expression data
# 30609 records in ../affyU74/affyU74.psl
# 7773 records written to affyGnfU74C.bed

# edit 3 .bed files to shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
#   (these files do not appear to have these long names in them to begin with)
    mkdir sav
    mv *.bed sav
    sed -e "s/U74Av2://" sav/affyGnfU74A.bed > affyGnfU74A.bed
    sed -e "s/U74Bv2://" sav/affyGnfU74B.bed > affyGnfU74B.bed
    sed -e "s/U74Cv2://" sav/affyGnfU74C.bed > affyGnfU74C.bed

    # and reload data into table
    hgLoadBed mm9 affyGnfU74A affyGnfU74A.bed
    #	Loaded 10309 elements of size 15
    hgLoadBed mm9 affyGnfU74B affyGnfU74B.bed
    #	Loaded 11324 elements of size 15
    hgLoadBed mm9 affyGnfU74C affyGnfU74C.bed
    #	Loaded 7773 elements of size 15

    # Add in sequence data for U74 tracks.
    #	This business is already in gbdb - 2007-09-10 - Hiram
    #	You do not need to repeat this symlink sequence
    # Copy consensus sequence to /gbdb if it isn't already
    #    mkdir -p /gbdb/hgFixed/affyProbes
    cd /gbdb/hgFixed/affyProbes
    # fix broken symlinks after directory structure changed
    # /projects/compbiodata ----> /projects/compbio/data
    rm U74*
    # make correct symlinks (hartera, 2005-05-03)
    ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa .
    ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa .
    ln -s /projects/compbio/data/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa .

    # used perl -pi.bak -e 's/;/ /' <file> to remove ";" after probe name
    # ASSUMED THIS IS ALREADY DONE LAST TIME FOR MM4.
    # reload sequences with prefix removed so acc matches name used in
    # other dependent tables
                                                    
    hgLoadSeq -abbr=U74Av2: mm9 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa
    #	12422 sequences
    hgLoadSeq -abbr=U74Bv2: mm9 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa
    #	12411 sequences
    hgLoadSeq -abbr=U74Cv2: mm9 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa
    #	11868 sequences

    #	building the gene sorter runs the following commands
    hgExpDistance mm9 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance \
	-lookup=knownToU74
    #	real    7m6.223s
    #	Have 9636 elements in affyGnfU74A
    #	Got 15902 unique elements in affyGnfU74A
    hgExpDistance mm9 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance \
	-lookup=knownToU74
    #	real    2m12.727s
    #	Have 11025 elements in affyGnfU74B
    #	Got 10442 unique elements in affyGnfU74B
    hgExpDistance mm9 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance \
	-lookup=knownToU74
    #	real    0m29.270s
    #	Have 7487 elements in affyGnfU74C
    #	Got 3259 unique elements in affyGnfU74C

##########################################################################
# BUILD NIBB IMAGE PROBES (DONE - 2007-09-10 - Hiram)
    ssh pk
    mkdir -p /cluster/data/mm9/bed/nibbPics/run
    cd /cluster/data/mm9/bed/nibbPics
    cp -p /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa .
    cd run
    mkdir psl
    ls -1 /scratch/data/mm9/nib/*.nib > genome.list
    echo ../nibbImageProbes.fa > probe.list

# Create parasol gensub template file
cat << '_EOF_' > template
#LOOP
blatz -rna -minScore=6000 -out=psl $(path1) $(path2) psl/$(root1)_$(root2).psl
#ENDLOOP
'_EOF_'
    # << happy emacs

# Create parasol batch
    gensub2 genome.list probe.list template jobList
    para create jobList
    para try ... check ... push ... etc... time
# Completed: 35 of 35 jobs
# CPU time in finished jobs:       9983s     166.39m     2.77h    0.12d  0.000 y
# IO & Wait Time:                   146s       2.43m     0.04h    0.00d  0.000 y
# Average job time:                 289s       4.82m     0.08h    0.00d
# Longest finished job:             729s      12.15m     0.20h    0.01d
# Submission to last job:           729s      12.15m     0.20h    0.01d

# Make sort and filter
#	Concatenate the per-job psl files from the psl directory, sort them
#	with the key starting at column 10, filter with pslReps using the
#	thresholds given below, order the survivors by column 14 and then
#	numerically by column 16, and finally trim the nib directory prefix
#	and the .nib suffix (presumably from the target name column).
#	NOTE(review): the "." in 's/.nib//' is unescaped, so it matches any
#	character followed by "nib", and only the first match on each line
#	is removed.
    catDir psl | sort -k 10 \
        | pslReps stdin stdout /dev/null -nohead -minAli=0.60 \
		-nearTop=0.001 -minCover=0.10 -minNearTopSize=80 \
	| sort -k 14,14 -k 16,16n \
	| sed 's#/scratch/data/mm9/nib/chr#chr#' \
	| sed 's/.nib//' > ../nibbImageProbes.psl

# Make bed file and copy in stuff
    ssh hgwdev
    cd /cluster/data/mm9/bed/nibbPics

# Load into database
    ln -s /cluster/data/xenTro1/bed/nibbPics/nibbImageProbes.fa \
	/gbdb/mm9/nibbImageProbes.fa
    hgLoadSeq mm9 /gbdb/mm9/nibbImageProbes.fa
    hgLoadPsl mm9 nibbImageProbes.psl

#########################################################################
# Creating visiGene tables for gene sorter business
#	(DONE - 2007-09-10 - Hiram)
    #	This business has cumulative effects on the visiGene database;
    #	for safety purposes, back up the visiGene database first
    ssh hgwdev
    mkdir -p /cluster/data/mm9/bed/vgProbes/visiGene.bak
    cd /cluster/data/mm9/bed/vgProbes/visiGene.bak
    hgsqldump --all -c --tab=. visiGene

    cd /cluster/data/mm9/bed/vgProbes
    mkdir working
    cd /cluster/data/mm9/bed/vgProbes
    cp -p ~/kent/src/hg/visiGene/vgProbeTrack/*.sql .
    #	this SEQ appears to find nothing new ?
    vgProbeTrack SEQ working mm9
# rc = 0 = count of primers for mrna search for taxon 10090
# rc = 0 = count of primers for genome search for taxon 10090
# bac list read done.
# found seq for 0 bacEndPairs
# rc = 0 = count of refSeq mrna for mm9
# rc = 0 = count of genRef mrna for mm9
# rc = 0 = count of genbank mrna for mm9
# rc = 0 = count of flatRef mrna for mm9
# rc = 0 = count of flatAll mrna for mm9
# rc = 0 = count of linkRef mrna for mm9
# rc = 0 = count of linkAll mrna for mm9
# rc = 0 = count of kgAlRef mrna for mm9
# rc = 0 = count of kgAlAll mrna for mm9

    #	and then, this creates the vgProbes table in mm9
    vgProbeTrack ALI working mm9 -sqlPath=..
    hgsql -e "select count(*) from vgProbes;" mm9
    #	24924
    hgsql -e "select count(*) from vgProbes;" mm8
    #	24615

    #	this appears to build working/vgPrbExt.fa and it loaded some sequences
    vgProbeTrack EXT working mm9
    #	this copies over all the items from vgProbes to start vgAllProbes
    vgProbeTrack SELFMAP working mm9 -sqlPath=..
    #	this adds frog alignments to vgAllProbes
    vgProbeTrack -sqlPath=.. REMAP working mm9 nibb nibbImageProbes \
	/gbdb/mm9/nibbImageProbes.fa
    hgsql -e "select count(*) from vgAllProbes;" mm9
    #	26289
    hgsql -e "select count(*) from vgAllProbes;" mm8
    #	25994

    #	finally, gathering together all alignments used and updates seq table
    vgProbeTrack EXTALL working mm9

    #	Then, during the gene sorter build, it does:
    knownToVisiGene mm9
    vgGetText visiGene.text mm7 mm8 mm9 hg17 hg18
    #	probe has 26611 rows
    #	gene has 20413 rows
    #	imageProbe has 125765 rows
    wc -l visiGene.text
    #	124186 visiGene.text
    #	compare to existing:
    wc -l /usr/local/apache/cgi-bin/visiGeneData/visiGene.text
    #	124186 /usr/local/apache/cgi-bin/visiGeneData/visiGene.text

#########################################################################
# Create Allen Brain Atlas mapping. (DONE - 2007-09-24 - Hiram)

# Set up directory
    ssh kkstore06
    mkdir /cluster/data/mm9/bed/allenBrain
    cd /cluster/data/mm9/bed/allenBrain

    # find most recent update of allProbes.fa to use for these alignments

    cp -p /cluster/data/mm6/bed/allenBrain/allProbes.fa ./allenBrainProbes.fa
    cp -p /cluster/data/mm6/bed/allenBrain/allenBrainUrl.tab .

# Set up a blat run to align the probes.
    mkdir split
    faSplit sequence allenBrainProbes.fa 200 split/rp
    mkdir run
    ssh pk
    cd /cluster/data/mm9/bed/allenBrain/run
    ls -1 ../split/*.fa > probe.list
    ls -1 /scratch/data/mm9/nib/*.nib > genome.list
    mkdir psl
    #	gensub2 template: one runBlat job per pairing of a genome.list entry
    #	(path1/root1) with a probe.list entry (path2/root2), with a parasol
    #	{check out line+ ...} clause on the expected psl output file.
    cat << '_EOF_' > template
#LOOP
runBlat $(path1) $(path2) $(root1) $(root2) {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs
    #	Write the csh job script runBlat:
    #	  $1 - genome nib file       $2 - probe fasta file
    #	  $3, $4 - the two root names; the output is named $3_$4.psl
    #	blat writes into a private directory /scratch/tmp/mm9/$3_$4, the
    #	result is then moved into psl/ and the private directory removed.
    #	The shared /scratch/tmp/mm9 directory is removed only if it is empty
    #	(rmdir --ignore-fail-on-non-empty).
    #	The csh -e flag on the #! line means the script stops at the first
    #	failing command.
    cat << '_EOF_' > runBlat
#!/bin/csh -ef
set ooc = /scratch/data/mm9/11.ooc
set tmpDir = /scratch/tmp/mm9
set workDir = $tmpDir/$3_$4
set pslOut = $3_$4.psl
mkdir -p $tmpDir
mkdir $workDir
blat -ooc=$ooc $1 $2 $workDir/$pslOut
mv $workDir/$pslOut psl/$pslOut
rmdir $workDir
rmdir --ignore-fail-on-non-empty $tmpDir
'_EOF_'
    # << happy emacs
    #	make the generated job script executable
    chmod +x runBlat

    gensub2 genome.list probe.list template jobList
    para create jobList
    para try ... check ... push ... etc.
# Completed: 6790 of 6790 jobs
# CPU time in finished jobs:      28129s     468.81m     7.81h    0.33d  0.001 y
# IO & Wait Time:                 23014s     383.57m     6.39h    0.27d  0.001 y
# Average job time:                   8s       0.13m     0.00h    0.00d
# Longest finished job:              29s       0.48m     0.01h    0.00d
# Submission to last job:           363s       6.05m     0.10h    0.00d

# Then do sorting and near-best-in-genome step on file server
    ssh kkstore06
    cd /cluster/data/mm9/bed/allenBrain/run
    pslSort dirs raw.psl tmp psl
    pslReps raw.psl ../best.psl -nohead -minCover=0.20 -minAli=0.96 \
	-nearTop=0.001 /dev/null
    #	Processed 63183 alignments
    sort -k14,14 -k16,16n ../best.psl > ../allenBrainAli.psl

# Clean up big files no longer needed
   rm raw.psl batch.bak
   rm -r psl
   rm -r ../split

# Load up database
   ssh hgwdev
   cd /cluster/data/mm9/bed/allenBrain

# Make a new table that contains the URLs for the allen brain genes
# Make this one first since all.joiner considers it the master table.
   hgsql mm9 < ~/kent/src/hg/lib/allenBrainUrl.sql
   hgsql mm9 -e \
	'load data local infile "allenBrainUrl.tab" into table allenBrainUrl;'

# Make probe alignment table, and load sequence.
   hgLoadPsl mm9 allenBrainAli.psl
   mkdir /gbdb/mm9/allenBrain
   ln -s /cluster/data/mm9/bed/allenBrain/allenBrainProbes.fa \
	/gbdb/mm9/allenBrain/allenBrainProbes.fa
   hgLoadSeq -replace mm9 /gbdb/mm9/allenBrain/allenBrainProbes.fa

# Make mapping between known genes and allenBrain	
   hgMapToGene mm9 allenBrainAli -type=psl knownGene knownToAllenBrain 

#########################################################################
# MOUSE AFFYMETRIX MOE430 TRACK (DONE - 2007-09-10 - Hiram)
#    mkdir -p /projects/compbio/data/microarray/affyMouse
    # Download MOE430A and MOE430B consensus sequences from Affymetrix web site
    # http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430
#    unzip MOE430*_consensus.zip

    # check for duplicate probes: there are none, all have unique names
    # check for duplicate probes: 100 from 136745_at to 1367551_a_at
    # remove "consensus:" and ";" from FASTA headers to shorten probeset
    # names for database

#    sed -e 's/consensus://' MOE430A_consensus | sed -e 's/;/ /' > MOE430_all.fa
#    sed -e 's/consensus://' MOE430B_consensus | sed -e 's/;/ /' >> MOE430_all.fa
 
#    cp /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \
#       /cluster/bluearc/affy/

    # (THE ABOVE WAS ALREADY DONE)

    # Set up cluster job to align MOE430 consensus sequences to mm9

    ssh kk
    mkdir /cluster/data/mm9/bed/affyMOE430
    cd /cluster/data/mm9/bed/affyMOE430

    ls -1 /iscratch/i/affy/MOE430_all.fa > probe.list
    cut -f1 /cluster/data/mm9/chrom.sizes > genome.list

    cat << '_EOF_' > template
#LOOP
blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/data/mm9/11.ooc /scratch/data/mm9/nib/$(path1).nib {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 genome.list probe.list template jobList
    mkdir psl
    para create jobList
    # Do the job with usual para try/check/push/time etc.
# Completed: 35 of 35 jobs
# CPU time in finished jobs:       9093s     151.55m     2.53h    0.11d  0.000 y
# IO & Wait Time:                   217s       3.62m     0.06h    0.00d  0.000 y
# Average job time:                 266s       4.43m     0.07h    0.00d
# Longest finished job:             602s      10.03m     0.17h    0.01d
# Submission to last job:           602s      10.03m     0.17h    0.01d

    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyMOE430.psl
    pslSort dirs raw.psl tmp psl

    # only use alignments that cover 30% of sequence and have at least
    # 95% identity in aligned region. 
    # low minCover as a lot of n's in these sequences
    pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 \
	raw.psl affyMOE430.psl /dev/null

    # Load alignments and sequences into database
    ssh hgwdev
    cd /cluster/data/mm9/bed/affyMOE430
    # shorten names in psl file
    sed -e 's/MOE430//' affyMOE430.psl > affyMOE430.psl.bak
    mv affyMOE430.psl.bak affyMOE430.psl

    # load track into database

    hgLoadPsl mm9 affyMOE430.psl
 
    # Add consensus sequences for MOE430
    # Copy sequences to gbdb if they are not there already
#    mkdir -p /gbdb/hgFixed/affyProbes
#    ln -s /projects/compbio/data/microarray/affyMouse/MOE430_all.fa \ 
#       /gbdb/hgFixed/affyProbes

    hgLoadSeq -abbr=MOE430 mm9 /gbdb/hgFixed/affyProbes/MOE430_all.fa
    
    # Clean up
    rm batch.bak raw.psl 

    #	and then, during the gene sorter build, it does:
    hgMapToGene mm9 affyMOE430 knownGene knownToMOE430
    hgMapToGene mm9 affyMOE430 -prefix=A: knownGene knownToMOE430A

#########################################################################
#  creating UCSC genes track (DONE - 2007-08-31 - 2007-09-25 - Hiram)
    #  working on the script mm9.ucscGenes10.csh in src/hg/makeDb/doc
    #	The tracks created above were done as they were encountered
    #	in working through that script.   Worked through that script
    #	approximately one kluster run at a time, using a large if (1 == 0)
    #	statement to skip over business that had been successfully completed.
    #	After it reached the point where it had begun to load the tables
    #	into the tempDb and started to fail at the missing tables affyGnf1m
    #	the successfully loaded tables in tempDb were moved to mm9 and
    #	the track began to function.  Then, working through the affy
    #	alignments above, and completing the loading of the knownTo tables
    #	for the gene sorter as they were completed.  Now continuing below
    #	with the rest of the steps manually since it is not necessary to
    #	use the tempDb and its /gbdb/ directory.  Everything is now taking
    #	place in the mm9 database.

    # example script to transfer appropriate tables from one DB to another
    # while saving the first set too

# Swap each freshly built table from mm9UCGenes into mm9, keeping the mm9
# table it replaces as mm9UCGenes.<table>_try0.
# SKIP lists the tables that are left alone.  It has to stay on a single
# line: grep treats a newline inside a pattern as a separator between
# alternative patterns, so a wrapped pattern whose second line starts with
# "s|^seq$|..." would also drop every table whose name contains an "s".
SKIP='allenBrainAli|allenBrainUrl|extFile|knownToEnsembl|vgProbes|vgAllProbes|^seq$|trackDb|history|chromInfo'
hgsql -N -e "show tables;" mm9UCGenes | grep -E -v "${SKIP}" | while read T
do
    echo -n "=== table ${T}: "
    # row counts before the swap: C1 = table currently in mm9,
    # C2 = replacement table in mm9UCGenes, D = C2 - C1
    C1=$(hgsql -N -e "select count(*) from ${T}" mm9)
    C2=$(hgsql -N -e "select count(*) from ${T}" mm9UCGenes 2> /dev/null)
    D=$(echo "${C1}" "${C2}" | awk '{printf "%d", $2-$1}')
    echo "${C1} - ${C2} - ${D}"
    echo "rename table mm9.${T} to mm9UCGenes.${T}_try0"
    echo "rename table mm9UCGenes.${T} to mm9.${T}"
    # park the old mm9 table first, then move the new one into its place
    hgsql -e "rename table mm9.${T} to mm9UCGenes.${T}_try0" mysql
    hgsql -e "rename table mm9UCGenes.${T} to mm9.${T}" mysql
done
    #	The egrep -v knocks out tables that are redundant, should be the same
    #	in both DBs

#########################################################################
# running the blastP operation to the other genomes for the gene sorter
#	(DONE - 2007-09-10 - Hiram)
    mkdir /cluster/data/mm9/bed/ucsc.10/hgNearBlastp
    cd /cluster/data/mm9/bed/ucsc.10/hgNearBlastp

    cat << '_EOF_' > config.ra
# Latest human vs. other Gene Sorter orgs:
# mouse, rat, zebrafish, worm, yeast, fly

targetGenesetPrefix known
targetDb mm9
queryDbs hg18 rn4 danRer4 dm2 ce4 sacCer1

mm9Fa /cluster/data/mm9/bed/ucsc.10/ucscGenes.faa
hg18Fa /cluster/data/hg18/bed/blastp/known.faa
rn4Fa /cluster/data/rn4/bed/blastp/known.faa
danRer4Fa /cluster/data/danRer4/bed/blastp/ensembl.faa
dm2Fa /cluster/data/dm2/bed/flybase4.2/flybasePep.fa
ce4Fa /cluster/data/ce4/bed/hgNearBlastp/070731/ce4.sangerPep.faa
sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa

buildDir /cluster/data/mm9/bed/ucsc.10/hgNearBlastp
scratchDir /san/sanvol1/scratch/mm9/jkgHgNearBlastp
'_EOF_'
    # << happy emacs
    #	takes about an hour
    time nice -n +19 doHgNearBlastp.pl config.ra > do.log 2>&1 &

#########################################################################
# fixup the blastP tables to remove non-syntenic hits
#	(DONE - 2007-09-11 - Hiram)
#  This was all re-done 2007-09-25, see below:
######  Update blast tabs after UCSC genes rebuild (DONE - 2007-09-25 - Hiram)
# Remove non-syntenic hits for human and rat
# Takes a few minutes
    cd /cluster/data/mm9/bed/ucsc.10
    synBlastp.csh mm9 rn4
# old number of unique query values: 31610
# old number of unique target values 7072
# new number of unique query values: 13973
# new number of unique target values 6888
    synBlastp.csh mm9 hg18
# old number of unique query values: 38136
# old number of unique target values 17214
# new number of unique query values: 0
# new number of unique target values 0

    # Make reciprocal best subset for the blastp pairs that are too
    # far for synteny to help
    cd /cluster/data/mm9/bed/ucsc.10/hgNearBlastp
    # Us vs. others
# For each distant organism: concatenate the per-job blastp output in both
# directions (mm9 -> other and other -> mm9), reduce the pair to the
# reciprocal-best subset with blastRecipBest, then load one result into mm9
# and the other into $otherDb.
# NOTE(review): the table names drBlastTab and tfBlastTab are fixed, so every
# pass of this loop loads into the same two table names whatever $otherDb is;
# presumably each organism needs its own table name -- confirm before reusing
# this recipe (this step was re-done with doHgNearBlastp.pl on 2007-09-25).
foreach otherDb (danRer4 dm2 ce4 sacCer1)
    set aToB = run.mm9.$otherDb
    set bToA = run.$otherDb.mm9
    cat $aToB/out/*.tab > $aToB/all.tab
    cat $bToA/out/*.tab > $bToA/all.tab
    blastRecipBest $aToB/all.tab $bToA/all.tab \
	$aToB/recipBest.tab $bToA/recipBest.tab
    hgLoadBlastTab mm9 drBlastTab $aToB/recipBest.tab
    hgLoadBlastTab $otherDb tfBlastTab $bToA/recipBest.tab
end
    # Clean up
    cat run.mm9.mm9/out/*.tab | gzip -c > run.mm9.mm9/all.tab.gz
    cat run.mm9.hg18/out/*.tab | gzip -c > run.mm9.hg18/all.tab.gz
    cat run.hg18.mm9/out/*.tab | gzip -c > run.hg18.mm9/all.tab.gz
    cat run.mm9.rn4/out/*.tab | gzip -c > run.mm9.rn4/all.tab.gz
    cat run.rn4.mm9/out/*.tab | gzip -c > run.rn4.mm9/all.tab.gz
    gzip run.*/all.tab
    rm -r run.*/out

#########################################################################
#  Update BLASTTAB blast tabs after UCSC genes rebuild
##	(DONE - 2007-09-25 - Hiram)
    #	log in to hgwdev (was "sh hgwdev", which would try to run a script
    #	named hgwdev instead of opening a session on that host)
    ssh hgwdev
    mkdir -p /cluster/data/mm9/bed/hgNearBlastp/070924
    cd /cluster/data/mm9/bed/hgNearBlastp/070924
    # Get the proteins used by all hgNear organisms:
    pepPredToFa hg18 knownGenePep hg18.known.faa
    pepPredToFa mm9 knownGenePep mm9.known.faa
    pepPredToFa rn4 knownGenePep rn4.known.faa
    pepPredToFa danRer4 ensPep danRer4.ensPep.faa
    pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa
    pepPredToFa ce4 sangerPep ce4.sangerPep.faa
    pepPredToFa sacCer1 sgdPep sacCer1.sgdPep.faa

    cat << '_EOF_' > config.ra
# Latest human vs. other Gene Sorter orgs:
# mouse, rat, zebrafish, worm, yeast, fly

targetGenesetPrefix known
targetDb mm9
queryDbs hg18 rn4 danRer4 dm3 ce4 sacCer1
recipBest         danRer4 dm3 ce4 sacCer1

mm9Fa     /cluster/data/mm9/bed/hgNearBlastp/070924/mm9.known.faa
hg18Fa    /cluster/data/mm9/bed/hgNearBlastp/070924/hg18.known.faa
rn4Fa     /cluster/data/mm9/bed/hgNearBlastp/070924/rn4.known.faa
danRer4Fa /cluster/data/mm9/bed/hgNearBlastp/070924/danRer4.ensPep.faa
dm3Fa     /cluster/data/mm9/bed/hgNearBlastp/070924/dm3.flyBasePep.faa
ce4Fa     /cluster/data/mm9/bed/hgNearBlastp/070924/ce4.sangerPep.faa
sacCer1Fa /cluster/data/mm9/bed/hgNearBlastp/070924/sacCer1.sgdPep.faa

buildDir /cluster/data/mm9/bed/hgNearBlastp/070924
scratchDir /san/sanvol1/scratch/mm9HgNearBlastp
'_EOF_'
    # << happy emacs

    # Run with -noLoad so we can eyeball files, manually load mm9 tables now,
    # and after release of mm9 Gene Sorter on the RR, overload other 
    # databases' mmBlastTab tables.
    time nice -n +19 doHgNearBlastp.pl -noLoad config.ra > do.log 2>&1 &
    tail -f do.log

Follow instructions at end of do.log, piecewise:
  - first execute all of the run.mm9.* load scripts
  - then execute the run.hg18.mm9 and run.rn4.mm9 scripts
  - then run Galt's script (this is why we load hg18 and rn4 early):
    synBlastp.csh mm9 hg18
    synBlastp.csh mm9 rn4
  -- The following was performed 2007-10-11
  - After mm9 hgNear/Gene Sorter is enabled on the RR:
    - run the remaining run.*.mm9 load scripts
    - then modify each $queryDb's hgGeneData/$org/$queryDb/otherOrg.ra
      to specify mm9 for mouse
    - then do a push request for $queryDbs.mmBlastTab and hgGeneData

#########################################################################
# MAKE FOLDUTR TABLES  (DONE - 2007-09-11 - Hiram)
# First set up directory structure and extract UTR sequence on hgwdev
#	Beware running this on pk since the program RNAfold which is used
#	during this process is only found on /cluster/bin/i386/
#	And there is no way for this cluster setup to verify success
#	of that program since it is hidden away in rnaFoldBig
#	Need to fix rnaFoldBig to recognize RNAfold missing ...
    ssh hgwdev
    mkdir /cluster/data/mm9/bed/ucsc.10/rnaStruct
    cd /cluster/data/mm9/bed/ucsc.10/rnaStruct
    mkdir -p utr3/split utr5/split utr3/fold utr5/fold
    utrFa mm9 knownGene utr3 utr3/utr.fa
    utrFa mm9 knownGene utr5 utr5/utr.fa

    # Split up files and make files that define job.
    faSplit sequence utr3/utr.fa 10000 utr3/split/s
    faSplit sequence utr5/utr.fa 10000 utr5/split/s
    ls -1 utr3/split > utr3/in.lst
    ls -1 utr5/split > utr5/in.lst
    cd utr3
    cat > template << '_EOF_'
#LOOP
rnaFoldBig split/$(path1) fold
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 in.lst single template jobList
    cp -p template ../utr5
    cd ../utr5
    gensub2 in.lst single template jobList

    ssh kk
    cd /cluster/data/mm9/bed/ucsc.10/rnaStruct/utr3
    para make jobList
# Completed: 9750 of 9750 jobs
# CPU time in finished jobs:     377924s    6298.73m   104.98h    4.37d  0.012 y
# IO & Wait Time:                 38985s     649.75m    10.83h    0.45d  0.001 y
# Average job time:                  43s       0.71m     0.01h    0.00d
# Longest finished job:            3432s      57.20m     0.95h    0.04d
# Submission to last job:         11280s     188.00m     3.13h    0.13d
    cd ../utr5
    para make jobList
# Completed: 9253 of 9253 jobs
# CPU time in finished jobs:      44949s     749.16m    12.49h    0.52d  0.001 y
# IO & Wait Time:                 51547s     859.11m    14.32h    0.60d  0.002 y
# Average job time:                  10s       0.17m     0.00h    0.00d
# Longest finished job:            1100s      18.33m     0.31h    0.01d
# Submission to last job:          1398s      23.30m     0.39h    0.02d

    # Load database
    ssh hgwdev
    cd /cluster/data/mm9/bed/ucsc.10/rnaStruct/utr5
    hgLoadRnaFold mm9 foldUtr5 fold
    #	Parsed 35796 files
    cd ../utr3
    hgLoadRnaFold -warnEmpty mm9 foldUtr3 fold
    #	only one is empty: uc009gyo.1
    # Seems to be a problem in
    # RNAfold, so not easy for us to fix. Consequence is not too bad, just a
    # few 3' UTRs will be missing annotation.  (in this case, only one)

    # Clean up
    tar cvzf ./fold.tgz ./fold
    rm -r split fold err batch.bak
    cd ../utr5
    tar cvzf ./fold.tgz ./fold
    rm -r split fold err batch.bak
#########################################################################
# Make pfam run.  Actual cluster run is about 6 hours.
#	(DONE - 2007-09-12 - Hiram)
# First get pfam global HMMs into /san/sanvol1/pfam somehow.
    ssh pk
    mkdir /san/sanvol1/scratch/mm9/ucscGenes
    cd /san/sanvol1/scratch/mm9/ucscGenes
    mkdir splitProt
    faSplit sequence /cluster/data/mm9/bed/ucsc.10/ucscGenes.faa \
	10000 splitProt/
    mkdir pfam
    cd pfam
    mkdir out
    ls -1 ../splitProt > gene.list
    cat << '_EOF_' > doPfam
#!/bin/csh -ef
/san/sanvol1/pfam/hmmpfam -E 0.1 /san/sanvol1/pfam/Pfam_fs $1 \
	> /scratch/tmp/mm9.$2
mv /scratch/tmp/mm9.$2 $3
'_EOF_'
    # << happy emacs
    chmod a+x doPfam
    cat << '_EOF_' > template
#LOOP
doPfam ../splitProt/$(path1) $(root1).pf {check out line out/$(root1).pf}
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 gene.list single template jobList
    para create jobList
    para try ... check ... push ... etc... time
    #	after some kluster difficulties
# Completed: 9666 of 9666 jobs
# CPU time in finished jobs:    3535078s   58917.96m   981.97h   40.92d  0.112 y
# IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
# Average job time:                 287s       4.78m     0.08h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            3430s      57.17m     0.95h    0.04d
# Submission to last job:         79051s    1317.52m    21.96h    0.91d

    # Make up pfamDesc.tab by converting pfam to a ra file first
    cat << '_EOF_' > makePfamRa.awk
/^NAME/ {print}
/^ACC/ {print}
/^DESC/ {print; printf("\n");}
'_EOF_'
    # << happy emacs

    awk -f makePfamRa.awk  /cluster/store12/pfam/Pfam_fs > pfamDesc.ra
    raToTab -cols=ACC,NAME,DESC pfamDesc.ra stdout | \
   awk -F '\t' '{
printf("%s\t%s\t%s\n", gensub(/\.[0-9]+/, "", "g", $1), $2, $3);
}' > pfamDesc.tab

    # Convert output to tab-separated file. 
    cd /cluster/data/mm9/bed/ucsc.10
    catDir /san/sanvol1/scratch/mm9/ucscGenes/pfam/out \
	| hmmPfamToTab -eValCol stdin ucscPfam.tab

    # Convert output to knownToPfam table
    awk '{printf("%s\t%s\n", $2, gensub(/\.[0-9]+/, "", "g", $1));}' \
	/san/sanvol1/scratch/mm9/ucscGenes/pfam/pfamDesc.tab > sub.foo
    cut -f 1,4 ucscPfam.tab | subColumn 2 stdin sub.foo knownToPfam.tab
    hgLoadSqlTab mm9 knownToPfam ~/kent/src/hg/lib/knownTo.sql \
	knownToPfam.tab
    cut -f 1-4 ucscPfam.tab > load.ucscPfam.tab
    hgLoadSqlTab mm9 ucscPfam ~/kent/src/hg/lib/ucscPfam.sql load.ucscPfam.tab
    cp -p /san/sanvol1/scratch/mm9/ucscGenes/pfam/pfamDesc.tab .
    hgLoadSqlTab mm9 pfamDesc ~/kent/src/hg/lib/pfamDesc.sql pfamDesc.tab

#########################################################################
# Do scop run. Takes about 3.5 hours (DONE - 2007-09-12 - Hiram)
# First get the SCOP HMMs (scop.hmm) into /san/sanvol1/scop somehow.
    ssh pk
    mkdir /san/sanvol1/scratch/mm9/ucscGenes/scop
    cd /san/sanvol1/scratch/mm9/ucscGenes/scop
    mkdir out
    ls -1 ../splitProt > gene.list
    cat << '_EOF_' > doScop
#!/bin/tcsh -ef
/san/sanvol1/pfam/hmmpfam -E 0.1 /san/sanvol1/scop/scop.hmm $1 \
	> /scratch/tmp/mm9.$2
mv /scratch/tmp/mm9.$2 $3
'_EOF_'
    chmod a+x doScop
    cat << '_EOF_' > template
#LOOP
doScop ../splitProt/$(path1) $(root1).pf {check out line out/$(root1).pf}
#ENDLOOP
'_EOF_'
    gensub2 gene.list single template jobList
    para create jobList
    para try ... check ... push ... etc... time
# Completed: 9666 of 9666 jobs
# CPU time in finished jobs:    3532425s   58873.76m   981.23h   40.88d  0.112 y
# IO & Wait Time:                     0s       0.00m     0.00h    0.00d  0.000 y
# Average job time:                 347s       5.78m     0.10h    0.00d
# Longest finished job:            6512s     108.53m     1.81h    0.08d
# Submission to last job:         12348s     205.80m     3.43h    0.14d


    # Convert scop output to tab-separated files
    ssh hgwdev
    cd /cluster/data/mm9/bed/ucsc.10
    catDir /san/sanvol1/scratch/mm9/ucscGenes/scop/out | \
	hmmPfamToTab -eValCol -scoreCol stdin scopPlusScore.tab
    scopCollapse scopPlusScore.tab /cluster/store12/scop/model.tab \
	ucscScop.tab scopDesc.tab knownToSuper.tab
    hgLoadSqlTab mm9 knownToSuper ~/kent/src/hg/lib/knownToSuper.sql \
	knownToSuper.tab

    hgLoadSqlTab mm9 ucscScop ~/kent/src/hg/lib/ucscScop.sql ucscScop.tab
    hgLoadSqlTab mm9 scopDesc ~/kent/src/hg/lib/scopDesc.sql scopDesc.tab

    # XXX - ccds is not yet available for Mm9 according to Mark
    # Regenerate ccdsKgMap table
    # /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap  -db=mm9 -loadDb \
    #	mm9.ccdsGene knownGene ccdsKgMap

    # Map old to new mapping - maybe next time, this is first genes on mm9
    # hgsql mm9 -N -e 'select * from knownGene' > knownGene_1.gp
    # genePredToBed knownGene_1.gp >knownGene_1.bed
    # cat refSeq/*.bed mrna/*.bed | txGeneExplainUpdate1 knownGene_1.bed \
    #	ucscGenes.bed stdin abWalk.bed kg2ToKg3.bed
    # hgLoadSqlTab $tempDb kg1ToKg2 ~/kent/src/hg/lib/kg2ToKg3.sql kg2ToKg3.bed

    # Build kgSpAlias table, which combines content of both kgAlias and kgProtAlias tables.

    hgsql mm9 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgAlias where kgXref.kgID=kgAlias.kgID' >j.tmp
         
    hgsql mm9 -N -e \
    'select kgXref.kgID, spID, alias from kgXref, kgProtAlias where kgXref.kgID=kgProtAlias.kgID'\
    >>j.tmp
    sort -u j.tmp > kgSpAlias.tab
    rm j.tmp

    hgLoadSqlTab mm9 kgSpAlias ~/kent/src/hg/lib/kgSpAlias.sql ./kgSpAlias.tab

#########################################################################
# Building PROTEOME BROWSER TABLES (DONE - 2007-09-12 - Hiram)

# These are instructions for building tables 
# needed for the Proteome Browser. 
 
# DON'T START THESE UNTIL TABLES FOR KNOWN GENES AND kgProtMap2 table
# ARE REBUILT.  
# This build is based on proteins DBs dated 070202.

# Create the working directory

    ssh hgwdev
    mkdir /cluster/data/mm9/bed/ucsc.10/pb
    cd /cluster/data/mm9/bed/ucsc.10/pb

    # Build the pepMwAa table

    hgsql proteins070202 -N -e \
"select info.acc, molWeight, aaSize from sp070202.info, sp070202.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > pepMwAa.tab

    hgLoadSqlTab mm9 pepMwAa ~/kent/src/hg/lib/pepMwAa.sql ./pepMwAa.tab

    # Build the pepPi table
    # protAcc.list is assembled from two queries: the accessions for taxon
    # 10090 from sp070202, then the distinct knownGene proteinID values that
    # contain a "-".  pbCalPi reads that list and writes pepPi.tab, which is
    # then loaded as the pepPi table.
    # NOTE(review): unlike the other hgsql queries in this section, the first
    # one below has no -N; presumably a column-header line ("acc") becomes the
    # first entry of protAcc.list -- confirm pbCalPi tolerates it.

    hgsql proteins070202 -e \
    "select info.acc from sp070202.info, sp070202.accToTaxon where accToTaxon.taxon=10090 and accToTaxon.acc = info.acc" > protAcc.list

    hgsql mm9 -N \
-e 'select proteinID from knownGene where proteinID like "%-%"' \
	| sort -u >> protAcc.list

    pbCalPi protAcc.list sp070202 pepPi.tab
    hgLoadSqlTab mm9 pepPi ~/kent/src/hg/lib/pepPi.sql ./pepPi.tab

    # Calculate and load pep distributions

    pbCalDist sp070202 proteins070202 10090 mm9 
    hgLoadSqlTab mm9 pepExonCntDist ~/kent/src/hg/lib/pepExonCntDist.sql \
	./pepExonCntDist.tab
    hgLoadSqlTab mm9 pepCCntDist ~/kent/src/hg/lib/pepCCntDist.sql \
	./pepCCntDist.tab
    hgLoadSqlTab mm9 pepHydroDist ~/kent/src/hg/lib/pepHydroDist.sql \
	./pepHydroDist.tab
    hgLoadSqlTab mm9 pepMolWtDist ~/kent/src/hg/lib/pepMolWtDist.sql \
	./pepMolWtDist.tab
    hgLoadSqlTab mm9 pepResDist ~/kent/src/hg/lib/pepResDist.sql \
	./pepResDist.tab
    hgLoadSqlTab mm9 pepIPCntDist ~/kent/src/hg/lib/pepIPCntDist.sql \
	./pepIPCntDist.tab
    hgLoadSqlTab mm9 pepPiDist ~/kent/src/hg/lib/pepPiDist.sql ./pepPiDist.tab


# Calculate frequency distributions

    pbCalResStd sp070202 10090 mm9

# Create pbAnomLimit and pbResAvgStd tables

    hgLoadSqlTab mm9 pbAnomLimit ~/kent/src/hg/lib/pbAnomLimit.sql \
	./pbAnomLimit.tab
    hgLoadSqlTab mm9 pbResAvgStd ~/kent/src/hg/lib/pbResAvgStd.sql \
	./pbResAvgStd.tab

    hgsql -N -e "select * from pbStamp;" mm8 > pbStamp.tab
    hgLoadSqlTab mm9 pbStamp ~/kent/src/hg/lib/pbStamp.sql \
	./pbStamp.tab

    #	Turn on protein and gene sorter
    hgsql -e 'update dbDb set hgNearOk=1,hgPbOk=1 where name="mm9";' \
	hgcentraltest

# Add mm9 to gdbPdb, pointing to proteins070202

    mysql> insert into gdbPdb values('mm9','proteins070202');

############################################################################
# BUILD KNOWN GENE LIST FOR GOOGLE.   (DONE - 2007-10-03 - Hiram)

    cd /cluster/data/mm9/bed
    rm -rf knownGeneList/mm9

    # Run hgKnownGeneList to generate the tree of HTML pages
    # under ./knownGeneList/mm9

    hgKnownGeneList mm9

    # copy over to /usr/local/apache/htdocs

    rm -rf /usr/local/apache/htdocs/knownGeneList/mm9
    rsync -a --progress ./knownGeneList/mm9/ \
	/usr/local/apache/htdocs/knownGeneList/mm9/
    #	if this is a new listing, add it to the top level
    #	knownGeneLists.html file

############################################################################
# SGP GENES (DONE - 2007-10-01 - Hiram)
    ssh kkstore06
    mkdir  /cluster/data/mm9/bed/sgp
    cd  /cluster/data/mm9/bed/sgp

    #   They don't do chrM  (we could just let that one fail ...)
    for C in `awk '{print $1}' /cluster/data/mm9/chrom.sizes | grep -v chrM`
    do
        wget --timestamping \
"http://genome.imim.es/genepredictions/M.musculus/mmJul2007/SGP/humangp200603/${C}.gtf" \
        -O "${C}.gtf"
    done

    ssh hgwdev
    cd /cluster/data/mm9/bed/sgp
    ldHgGene -gtf -genePredExt mm9 sgpGene chr*.gtf
    #	Read 35983 transcripts in 290486 lines in 34 files
    #	35983 groups 32 seqs 1 sources 3 feature types
    #	35983 gene predictions

    featureBits mm9 -enrichment refGene:CDS sgpGene
    #	refGene:CDS 1.165%, sgpGene 1.439%, both 1.005%, cover 86.28%,
    #	enrich 59.96x
    featureBits mm8 -enrichment refGene:CDS sgpGene
    #	refGene:CDS 1.186%, sgpGene 1.455%, both 1.025%, cover 86.47%,
    #	enrich 59.42x
    featureBits mm9 -enrichment knownGene:CDS sgpGene
    #	knownGene:CDS 1.278%, sgpGene 1.439%, both 1.080%, cover 84.53%,
    #	enrich 58.74x
    featureBits mm8 -enrichment knownGene:CDS sgpGene
    #	knownGene:CDS 1.109%, sgpGene 1.455%, both 0.931%, cover 83.98%,
    #	enrich 57.71x

#####################################################################
# LOAD GENEID GENES (DONE - 2007-10-01 - Hiram)
    ssh kkstore06
    mkdir -p /cluster/data/mm9/bed/geneid/download
    cd /cluster/data/mm9/bed/geneid/download

    bash
    awk '{print $1}' ../../../chrom.sizes | while read C
    do
      echo $C
      wget --timestamping \
"http://genome.imim.es/genepredictions/M.musculus/mmJul2007/geneid_v1.2/${C}.gtf" \
	-O ${C}.gtf
      wget --timestamping \
"http://genome.imim.es/genepredictions/M.musculus/mmJul2007/geneid_v1.2/${C}.prot" \
	-O ${C}.prot
    done
    exit

    # Add missing .1 to protein id's

    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
    end
    ssh hgwdev
    cd /cluster/data/mm9/bed/geneid
    ldHgGene -genePredExt -gtf mm9 geneid download/*.gtf
# Read 36708 transcripts in 287399 lines in 35 files
# 36708 groups 34 seqs 1 sources 3 feature types
# 36708 gene predictions

    #	the chr16_random file is empty, do not attempt to use it
    hgPepPred mm9 generic geneidPep \
	`ls download/*-fixed.prot | grep -v chr16_random`
    featureBits mm9 -enrichment refGene geneid
# refGene 1.975%, geneid 1.590%, both 0.956%, cover 48.39%, enrich 30.44x
    featureBits mm8 -enrichment refGene geneid
# refGene 2.010%, geneid 1.592%, both 0.974%, cover 48.44%, enrich 30.43x
    featureBits mm7 -enrichment refGene geneid
# refGene 2.002%, geneid 1.579%, both 0.952%, cover 47.57%, enrich 30.12x

    featureBits mm9 -enrichment knownGene geneid
# knownGene 2.686%, geneid 1.590%, both 1.047%, cover 38.97%, enrich 24.52x
    featureBits mm8 -enrichment knownGene geneid
# knownGene 2.130%, geneid 1.592%, both 0.900%, cover 42.23%, enrich 26.53x
    featureBits mm7 -enrichment knownGene geneid
# knownGene 2.058%, geneid 1.579%, both 0.859%, cover 41.72%, enrich 26.42x

#########################################################################
# BLASTZ/CHAIN/NET Orangutan ponAbe2 (DONE - 2007-09-21 - Hiram)
    ssh kkstore02
    #	use a screen to control this job
    screen
    mkdir /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19
    cd /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19

    cat << '_EOF_' > DEF
# mouse vs orangutan
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=0

# QUERY: Orangutan ponAbe2
SEQ2_DIR=/cluster/bluearc/scratch/data/ponAbe2/ponAbe2.2bit
SEQ2_LEN=/cluster/data/ponAbe2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=10000

BASE=/cluster/data/mm9/bed/blastzPonAbe2.2007-09-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-stop=load -chainMinScore=3000 \
	-chainLinearGap=medium -bigClusterHub=pk > do.log 2>&1 &
    #	real    62m34.156s
    #	some pk kluster difficulties, fixup and complete manually
# Completed: 104880 of 104880 jobs
# CPU time in finished jobs:    7142978s  119049.64m  1984.16h   82.67d  0.227 y
# IO & Wait Time:                556393s    9273.21m   154.55h    6.44d  0.018 y
# Average job time:                  73s       1.22m     0.02h    0.00d
# Longest finished job:             507s       8.45m     0.14h    0.01d
# Submission to last job:         65973s    1099.55m    18.33h    0.76d
    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-continue=cat -chainMinScore=3000 \
	-chainLinearGap=medium -bigClusterHub=pk > cat.log 2>&1 &
    #	real    166m20.442s
    cat fb.mm9.chainPonAbe2Link.txt
    #	914561309 bases of 2620346127 (34.902%) in intersection

    #	And, for the swap
    mkdir /cluster/data/ponAbe2/bed/blastz.mm9.swap
    cd /cluster/data/ponAbe2/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/mm9/bed/blastzPonAbe2.2007-09-19/DEF \
	-chainMinScore=3000 -swap -chainLinearGap=medium \
	-bigClusterHub=pk > swap.log 2>&1 &
    #	real    102m23.209s
    cat fb.ponAbe2.chainMm9Link.txt
    #	948458190 bases of 3093572278 (30.659%) in intersection

    # create the syntenic maf nets:
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzPonAbe2.2007-09-19
    time nice -n +19 doBlastzChainNet.pl -verbose=2 `pwd`/DEF \
	-continue=syntenicNet -syntenicNet -chainMinScore=3000 \
	-chainLinearGap=medium -bigClusterHub=pk > syntenicNet.log 2>&1 &
    #	real    22m16.544s

########################################################################
# BLASTZ/CHAIN/NET Frog X. tropicalis xenTro2 (DONE - 2007-09-23 - Hiram)
    ssh kkstore04
    screen # use screen to manage this job
    # XXX note for next time, missing the TMPDIR in the DEF file
    mkdir /cluster/data/mm9/bed/blastzXenTro2.2007-09-19
    cd /cluster/data/mm9/bed/blastzXenTro2.2007-09-19

    cat << '_EOF_' > DEF
# Mouse (mm9) vs frog (xenTro2)
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=0

# QUERY: Frog xenTro2 - single chunk big enough to run two of the
#               largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
SEQ2_LEN=/cluster/data/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=10000

BASE=/cluster/data/mm9/bed/blastzXenTro2.2007-09-19
'_EOF_'
    # << emacs

    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
    #	real    1050m55.259s
    # after kk difficulties, finishing the first kluster run manually

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=kk -chainMinScore=5000 -chainLinearGap=loose \
	`pwd`/DEF > blastz.out 2>&1 &
# Completed: 126539 of 126540 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:   15750656s  262510.93m  4375.18h  182.30d  0.499 y
# IO & Wait Time:                843281s   14054.69m   234.24h    9.76d  0.027 y
# Average job time:                 131s       2.19m     0.04h    0.00d
# Longest finished job:            2039s      33.98m     0.57h    0.02d
# Submission to last job:         79275s    1321.25m    22.02h    0.92d

    #	A single job kept having trouble, finished it on kolossus:
    ssh kolossus
    cd /cluster/data/mm9/bed/blastzXenTro2.2007-09-19/run.blastz
time nice -n +19 /cluster/bin/scripts/blastz-run-ucsc -outFormat psl \
/scratch/data/mm9/mm9.2bit:chr2:80000000-90000000 qParts/part008.lst ../DEF \
../psl/mm9.2bit:chr2:80000000-90000000/mm9.2bit:chr2:80000000-90000000_part008.lst.psl
    #	continuing after that
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-continue=cat -bigClusterHub=kk -chainMinScore=5000 \
	-chainLinearGap=loose `pwd`/DEF > cat.out 2>&1 &
    #	real    62m17.627s
    cat fb.mm9.chainXenTro2Link.txt
    #	82054987 bases of 2620346127 (3.131%) in intersection

    #	Then to swap over to xenTro2
    mkdir /cluster/data/xenTro2/bed/blastz.mm9.swap
    cd /cluster/data/xenTro2/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-swap -bigClusterHub=kk -chainMinScore=5000 \
	/cluster/data/mm9/bed/blastzXenTro2.2007-09-19/DEF \
	-chainLinearGap=loose > swap.out 2>&1 &
    #	real    47m53.428s

    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzXenTro2.2007-09-19
    time nice -n +19 featureBits mm9 chainXenTro2Link \
	> fb.mm9.chainXenTro2Link 2>&1 &
    #	68050843 bases of 2567283971 (2.651%) in intersection
    cd /cluster/data/xenTro2/bed/blastz.mm9.swap
    time nice -n +19 featureBits xenTro2 chainMm8Link \
	> fb.xenTro2.chainMm8Link 2>&1
    #	72840135 bases of 1359412157 (5.358%) in intersection

#########################################################################
## BLASTZ Lizard anoCar1 - (DONE - 2007-09-21 - Hiram)
    ssh kkstore04
    mkdir /cluster/data/mm9/bed/blastzAnoCar1.2007-09-19
    cd /cluster/data/mm9/bed/blastzAnoCar1.2007-09-19

    cat << '_EOF_' > DEF
# Mouse (mm9) vs lizard (anoCar1)
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=0

# QUERY: Lizard AnoCar1 - largest chunk big enough for largest scaffold
SEQ2_DIR=/san/sanvol1/scratch/anoCar1/anoCar1.2bit
SEQ2_LEN=/cluster/data/anoCar1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=30
SEQ2_LAP=10000

BASE=/cluster/data/mm9/bed/blastzAnoCar1.2007-09-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-qRepeats=windowmaskerSdust \
	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
    #	real    911m49.918s
    # after kk difficulties, finishing the first kluster run manually
# Completed: 86355 of 86355 jobs
# CPU time in finished jobs:   11171051s  186184.18m  3103.07h  129.29d  0.354 y
# IO & Wait Time:                662082s   11034.70m   183.91h    7.66d  0.021 y
# Average job time:                 137s       2.28m     0.04h    0.00d
# Longest finished job:            1467s      24.45m     0.41h    0.02d
# Submission to last job:         62938s    1048.97m    17.48h    0.73d
    #	continuing
    time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	DEF -chainMinScore=5000 \
	-continue=cat -qRepeats=windowmaskerSdust \
	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > cat.log 2>&1 &
    #	real    31m44.652s
    cat  fb.mm9.chainAnoCar1Link.txt
    #	89239796 bases of 2620346127 (3.406%) in intersection

    #	and for the swap
    mkdir /cluster/data/anoCar1/bed/blastz.mm9.swap
    cd /cluster/data/anoCar1/bed/blastz.mm9.swap
    time nice -n +19 ~/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	/cluster/data/mm9/bed/blastzAnoCar1.2007-09-19/DEF -chainMinScore=5000 \
	-swap -qRepeats=windowmaskerSdust \
	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > swap.log 2>&1 &
    #	real    29m12.291s
    cat fb.anoCar1.chainMm9Link.txt
    #	85923556 bases of 1741478929 (4.934%) in intersection

#########################################################################
# BLASTZ Chicken galGal3 (DONE - 2007-09-25 - Hiram)
    ssh kkstore03
    screen # use screen to control this job
    mkdir /cluster/data/mm9/bed/blastzGalGal3.2007-09-21
    cd /cluster/data/mm9/bed/blastzGalGal3.2007-09-21

    # This partitioning is too large to run on kk, must run this on pk
    #	or change the partitioning

    cat << '_EOF_' > DEF
# mouse vs chicken

BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_SMSK=/scratch/data/mm9/notInOthers
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Chicken galGal3 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/hg/galGal3/nib
SEQ2_LEN=/cluster/data/galGal3/chrom.sizes
SEQ2_SMSK=/san/sanvol1/galGal3/linSpecRep
SEQ2_CHUNK=200000000
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzGalGal3.2007-09-21
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
    #	real    587m53.468s
# Completed: 16680 of 17168 jobs
# Crashed: 488 jobs
# CPU time in finished jobs:    7758569s  129309.48m  2155.16h   89.80d  0.246 y
# IO & Wait Time:                190128s    3168.80m    52.81h    2.20d  0.006 y
# Average job time:                 477s       7.94m     0.13h    0.01d
# Longest finished job:            6501s     108.35m     1.81h    0.08d
# Submission to last job:        271554s    4525.90m    75.43h    3.14d
    #	the kk cluster could not complete some of these jobs.  A recovery job
    #	list was created from the remaining jobs and completed on pk
# Completed: 488 of 488 jobs
# CPU time in finished jobs:    1226144s   20435.73m   340.60h   14.19d  0.039 y
# IO & Wait Time:                  6875s     114.58m     1.91h    0.08d  0.000 y
# Average job time:                2527s      42.11m     0.70h    0.03d
# Longest finished job:            3872s      64.53m     1.08h    0.04d
# Submission to last job:         11739s     195.65m     3.26h    0.14d
    #	continuing
    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-continue=cat -chainLinearGap=loose -bigClusterHub=pk -verbose=2 \
	> cat.log 2>&1 &
    #	real    18m35.814s
    cat fb.mm9.chainGalGal3Link.txt
    #	97711788 bases of 2620346127 (3.729%) in intersection

    #	and the swap
    mkdir /cluster/data/galGal3/bed/blastz.mm9.swap
    cd /cluster/data/galGal3/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 -verbose=2 \
	/cluster/data/mm9/bed/blastzGalGal3.2007-09-21/DEF \
	-swap -chainLinearGap=loose -bigClusterHub=pk  > swap.log 2>&1 &
    #	real    12m54.737s
    cat fb.galGal3.chainMm9Link.txt
    #	84990797 bases of 1042591351 (8.152%) in intersection

#########################################################################
# BLASTZ Platypus ornAna1 - (DONE - 2007-09-21 - 2007-09-25 - Hiram)
    ssh kkstore05
    mkdir /cluster/data/mm9/bed/blastzOrnAna1.2007-09-21
    cd /cluster/data/mm9/bed/blastzOrnAna1.2007-09-21

    cat << '_EOF_' > DEF
# mouse vs. platypus
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=0

# QUERY: ornAna1
SEQ2_DIR=/iscratch/i/ornAna1/ornAna1.2bit
SEQ2_LEN=/cluster/data/ornAna1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=300
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzOrnAna1.2007-09-21
TMPDIR=/scratch/tmp
'_EOF_'
    # << emacs

    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-chainLinearGap=loose -bigClusterHub=kk -verbose=2 > do.log 2>&1 &
    #	real    912m18.732s
    cat fb.mm9.chainOrnAna1Link.txt
    #	141953739 bases of 2620346127 (5.417%) in intersection

    #	and the swap
    mkdir /cluster/data/ornAna1/bed/blastz.mm9.swap
    cd /cluster/data/ornAna1/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 -verbose=2 \
	/cluster/data/mm9/bed/blastzOrnAna1.2007-09-21/DEF \
	-swap -chainLinearGap=loose -bigClusterHub=kk > swap.log 2>&1 &
    #	real    123m16.632s
    cat fb.ornAna1.chainMm9Link.txt
    #	135570580 bases of 1842236818 (7.359%) in intersection

#########################################################################
# Blastz Chimp panTro2 - (DONE - 2007-09-24 - 2007-09-25 - Hiram)
    ssh kkstore04
    mkdir /cluster/data/mm9/bed/blastzPanTro2.2007-09-24
    cd /cluster/data/mm9/bed/blastzPanTro2.2007-09-24

    cat << '_EOF_' > DEF
# Mouse vs Chimp
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_SMSK=/scratch/data/mm9/notInOthers
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Chimp PanTro2
SEQ2_DIR=/scratch/hg/panTro2/nib
SEQ2_LEN=/cluster/data/panTro2/chrom.sizes
SEQ2_SMSK=/cluster/bluearc/panTro2/linSpecRep/notInRodent
SEQ2_CHUNK=50000000
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzPanTro2.2007-09-24
TMPDIR=/scratch/tmp
'_EOF_'
    # << emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	DEF > blastz.out 2>&1 &
    #	real    701m23.446s
    cat fb.mm9.chainPanTro2Link.txt
    #	987180081 bases of 2620346127 (37.674%) in intersection

    #	and the swap
    mkdir /cluster/data/panTro2/bed/blastz.mm9.swap
    cd /cluster/data/panTro2/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/mm9/bed/blastzPanTro2.2007-09-24/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-swap > swap.log 2>&1 &
    #	real    87m25.448s
    cat fb.panTro2.chainMm9Link.txt
    #	997050630 bases of 2909485072 (34.269%) in intersection

    #	create syntenic maf nets:
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-syntenicNet -continue=syntenicNet DEF > syntenicNet.out 2>&1 &
    #	real 25m13.118s

#########################################################################
# Blastz Horse equCab1 - (DONE - 2007-09-24 - 2007-09-25 - Hiram)
    ssh kkstore05
    mkdir /cluster/data/mm9/bed/blastzEquCab1.2007-09-24
    cd /cluster/data/mm9/bed/blastzEquCab1.2007-09-24

    cat << '_EOF_' > DEF
# Mouse vs Horse

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Horse EquCab1
SEQ2_DIR=/san/sanvol1/scratch/equCab1/equCab1.2bit
SEQ2_LEN=/cluster/data/equCab1/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzEquCab1.2007-09-24
TMPDIR=/scratch/tmp
'_EOF_'
    # << emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	DEF > blastz.out 2>&1 &
    #	real    1582m34.597s
    cat fb.mm9.chainEquCab1Link.txt
    #	911418189 bases of 2620346127 (34.782%) in intersection

    #	and the swap
    mkdir /cluster/data/equCab1/bed/blastz.mm9.swap
    cd /cluster/data/equCab1/bed/blastz.mm9.swap

    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	/cluster/data/mm9/bed/blastzEquCab1.2007-09-24/DEF \
	-swap -bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	> swap.out 2>&1 &
    #	real ~110m
     cat fb.equCab1.chainMm9Link.txt
    #	901367656 bases of 2421923695 (37.217%) in intersection

    #	create the syntenic maf nets
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	-continue=syntenicNet -syntenicNet DEF > syntenicNet.out 2>&1 &
    #	real 29m40.546s

#########################################################################
# Blastz Cow bosTau3 (DONE - 2007-09-25 - Hiram)
    ssh kkstore05
    screen # use a screen to control this job
    mkdir /cluster/data/mm9/bed/blastzBosTau3.2007-09-25
    cd /cluster/data/mm9/bed/blastzBosTau3.2007-09-25

    cat << '_EOF_' > DEF
# Mouse vs Cow

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Cow bosTau3
SEQ2_DIR=/san/sanvol1/scratch/bosTau3/bosTau3.2bit
SEQ2_LEN=/cluster/data/bosTau3/chrom.sizes
SEQ2_LIMIT=100
SEQ2_CHUNK=50000000
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzBosTau3.2007-09-25
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -chainMinScore=3000 -verbose=2 \
	DEF -bigClusterHub=pk -chainLinearGap=medium > do.log 2>&1 &
    #	real    733m40.065s
    cat fb.mm9.chainBosTau3Link.txt
    #	690515959 bases of 2620346127 (26.352%) in intersection

    #	and for the swap
    mkdir /cluster/data/bosTau3/bed/blastz.mm9.swap
    cd /cluster/data/bosTau3/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -chainMinScore=3000 -verbose=2 \
	/cluster/data/mm9/bed/blastzBosTau3.2007-09-25/DEF \
	-swap -bigClusterHub=pk -chainLinearGap=medium > swap.log 2>&1 &
    #	real    100m20.707s
    cat fb.bosTau3.chainMm9Link.txt
    #	707779988 bases of 2731807384 (25.909%) in intersection

    #	create the syntenic maf nets
    time nice -n +19 doBlastzChainNet.pl -chainMinScore=3000 -verbose=2 \
	-syntenicNet -continue=syntenicNet \
	DEF -bigClusterHub=pk -chainLinearGap=medium > syntenicNet.log 2>&1 &
    #	real 16m28.741s

#########################################################################
# Blastz Opossum monDom4 (DONE - 2007-09-25 - 2007-09-27 - Hiram)
    ssh kkstore04
    screen # use screen to manage this job
    mkdir /cluster/data/mm9/bed/blastzMonDom4.2007-09-25
    cd /cluster/data/mm9/bed/blastzMonDom4.2007-09-25

    #	the opossum chroms are too large to work with on the kk, must run this
    #	on the pk kluster
    cat << '_EOF_' > DEF
# Mouse vs. opossum

BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Opossum monDom4
SEQ2_DIR=/scratch/hg/monDom4/monDom4.2bit
SEQ2_LEN=/cluster/data/monDom4/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzMonDom4.2007-09-25
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-chainLinearGap=loose -bigClusterHub=pk -verbose=2 > do.log 2>&1 &
    #	real    811m19.320s
    # problem on kki run, monDom4 wasn't distributed on the Iservers to
    #	/scratch/hg/monDom4/ - straighten that up, and finish that run, then
    #	continuing
    time nice -n +19 doBlastzChainNet.pl DEF -chainMinScore=5000 \
	-continue=chainMerge -chainLinearGap=loose \
	-bigClusterHub=pk -verbose=2 > chainMerge.log 2>&1 &
    #	real    158m9.287s
    cat fb.mm9.chainMonDom4Link.txt
    #	255535025 bases of 2620346127 (9.752%) in intersection

    #	and for the swap
    mkdir /cluster/data/monDom4/bed/blastz.mm9.swap
    cd /cluster/data/monDom4/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -chainMinScore=5000 -verbose=2 \
	/cluster/data/mm9/bed/blastzMonDom4.2007-09-25/DEF \
	-swap -chainLinearGap=loose \
	-bigClusterHub=pk > swap.log 2>&1 &
    #	real    59m19.005s
    cat  fb.monDom4.chainMm9Link.txt
    #	254018516 bases of 3501643220 (7.254%) in intersection

#########################################################################
# Blastz Tenrec echTel1 (DONE - 2007-09-25 - 2007-09-27 - Hiram)
    ssh kkstore02
    screen # use a screen to control this job
    mkdir /cluster/data/mm9/bed/blastzEchTel1.2007-09-25
    cd /cluster/data/mm9/bed/blastzEchTel1.2007-09-25

    cat << '_EOF_' > DEF
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - Tenrec echTel1
SEQ2_DIR=/scratch/hg/echTel1/echTel1.2bit
SEQ2_LEN=/scratch/hg/echTel1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=800
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzEchTel1.2007-09-25
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-bigClusterHub=kk -chainLinearGap=medium DEF > do.log 2>&1 &
    #	real    2721m33.204s
    cat fb.mm9.chainEchTel1Link.txt
    #	291920039 bases of 2620346127 (11.141%) in intersection

    #	and for the swap
    mkdir /cluster/data/echTel1/bed/blastz.mm9.swap
    cd /cluster/data/echTel1/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	/cluster/data/mm9/bed/blastzEchTel1.2007-09-25/DEF \
	-swap -bigClusterHub=kk -chainLinearGap=medium > swap.log 2>&1 &
    #	real    520m9.198s
    cat  fb.echTel1.chainMm9Link.txt
    #	298656963 bases of 2111581369 (14.144%) in intersection

    #	create syntenic maf nets
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	DEF -continue=syntenicNet -bigClusterHub=kk \
	-syntenicNet -chainLinearGap=medium > syntenicNet.log 2>&1 &
    #	real 3m4.285s

    #	create reciprocal best chains/nets
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzEchTel1.2007-09-25
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 echTel1 \
	> rbest.log 2>&1 &
    #	real    34m12.936s

#########################################################################
# Blastz Tree Shrew tupBel1 (DONE - 2007-09-27 - 2007-10-01 - Hiram)
    ssh kkstore05
    screen # use screen to control this job
    mkdir /cluster/data/mm9/bed/blastzTupBel1.2007-09-27
    cd /cluster/data/mm9/bed/blastzTupBel1.2007-09-27

    cat << '_EOF_' > DEF
# Mouse vs. Tree Shrew
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY:  Tree shrew tupBel1
SEQ2_DIR=/san/sanvol1/scratch/tupBel1/tupBel1.2bit
SEQ2_LEN=/cluster/data/tupBel1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzTupBel1.2007-09-27
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \
	> chainMerge.log 2>&1 &
    #	real    1262m32.699s
    #	the load should fail due to missing repeat masker tables in tupBel1
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
	> net.log 2>&1 &
    #	real    69m41.901s
    #	and indeed it did.  Loading the net track manually:
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzTupBel1.2007-09-27/axtChain
    cp -p noClass.net mm9.tupBel1.net
    time nice -n +19 netFilter -minGap=10 mm9.tupBel1.net \
	| hgLoadNet -warn mm9 netTupBel1 stdin
    cd /cluster/data/mm9/bed/blastzTupBel1.2007-09-27
    time nice -n +19 featureBits mm9 chainTupBel1Link \
	> fb.mm9.chainTupBel1Link.txt 2>&1
    cat fb.mm9.chainTupBel1Link.txt
    #	552865662 bases of 2620346127 (21.099%) in intersection

    #	and, to finish it all off, with syntenic net
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-continue=download -bigClusterHub=pk \
	-syntenicNet -chainLinearGap=medium DEF > syntenicNet.log 2>&1 &
    #	real    14m42.816s

    #	create reciprocal best chains/nets
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzTupBel1.2007-09-27
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 tupBel1 \
	> rbest.log 2>&1 &
    #	real    41m12.278s

#########################################################################
# Blastz Bush Baby otoGar1 (DONE - 2007-09-27 - 2007-09-28 - Hiram)
    ssh kkstore05
    screen # use screen to control this job
    mkdir /cluster/data/mm9/bed/blastzOtoGar1.2007-09-27
    cd /cluster/data/mm9/bed/blastzOtoGar1.2007-09-27

    cat << '_EOF_' > DEF
# Mouse vs. Bush Baby
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY:  Bush baby otoGar1
SEQ2_DIR=/san/sanvol1/scratch/otoGar1/otoGar1.2bit
SEQ2_LEN=/cluster/data/otoGar1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzOtoGar1.2007-09-27
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \
	> chainMerge.log 2>&1 &
    #	real    873m23.531s
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
	> net.log 2>&1 &
    #	real    67m7.172s
    cat fb.mm9.chainOtoGar1Link.txt
    #	601932945 bases of 2620346127 (22.972%) in intersection

    #	and run the syntenicNet and cleanup
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \
	-syntenicNet > syntenicNet.log 2>&1 &
    #	real 13m57.573s

    #	create reciprocal best chains/nets
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzOtoGar1.2007-09-27
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 otoGar1 \
	> rbest.log 2>&1 &
    #	real    40m1.428s

#########################################################################
# Blastz Armadillo dasNov1 (DONE - 2007-09-27 - 2007-10-02 - Hiram)
    ssh kkstore04
    screen # use screen to control this job
    mkdir /cluster/data/mm9/bed/blastzDasNov1.2007-09-27
    cd /cluster/data/mm9/bed/blastzDasNov1.2007-09-27

    cat << '_EOF_' > DEF
# Mouse vs. Armadillo
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Armadillo dasNov1
SEQ2_DIR=/scratch/hg/dasNov1/dasNov1.2bit
SEQ2_LEN=/cluster/data/dasNov1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzDasNov1.2007-09-27
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
	> load.log 2>&1 &
    #	real    3607m35.169s
    cat fb.mm9.chainDasNov1Link.txt
    #	433593082 bases of 2620346127 (16.547%) in intersection
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \
	-syntenicNet > syntenicNet.log 2>&1 &
    #	real    15m7.642s

    #	create reciprocal best chains/nets
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzDasNov1.2007-09-27
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 dasNov1 \
	> rbest.log 2>&1 &
    #	real    39m18.156s

#########################################################################
# Blastz Rabbit oryCun1 (DONE - 2007-09-28 - 2007-09-29 - Hiram)
    ssh kkstore04
    screen # use screen to control this job
    mkdir /cluster/data/mm9/bed/blastzOryCun1.2007-09-28
    cd /cluster/data/mm9/bed/blastzOryCun1.2007-09-28

    cat << '_EOF_' > DEF
# Mouse vs. Rabbit
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rabbit oryCun1
SEQ2_DIR=/scratch/hg/oryCun1/oryCun1.2bit
SEQ2_LEN=/cluster/data/oryCun1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzOryCun1.2007-09-28
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \
	> chainMerge.log 2>&1 &
    #	real    2126m59.162s
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
	> load.log 2>&1 &
    #	real    53m28.279s
    cat fb.mm9.chainOryCun1Link.txt
    #	496428446 bases of 2620346127 (18.945%) in intersection
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \
	-syntenicNet > syntenicNet.log 2>&1 &
    #	real 9m27.321s

    #	create reciprocal best chains/nets
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzOryCun1.2007-09-28
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 oryCun1 \
	> rbest.log 2>&1 &
    #	real    37m32.151s

#########################################################################
# Blastz Cat felCat3 (DONE - 2007-09-28 - 2007-09-29 - Hiram)
    ssh kkstore05
    screen # use screen to control this job
    mkdir /cluster/data/mm9/bed/blastzFelCat3.2007-09-28
    cd /cluster/data/mm9/bed/blastzFelCat3.2007-09-28

    cat << '_EOF_' > DEF
# Mouse vs. Cat
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Cat felCat3
SEQ2_DIR=/san/sanvol1/scratch/felCat3/felCat3.2bit
SEQ2_LEN=/cluster/data/felCat3/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzFelCat3.2007-09-28
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \
	> chainMerge.log 2>&1 &
    #	real    1597m21.032s
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
	> load.log 2>&1 &
    #	real    39m30.078s
    cat fb.mm9.chainFelCat3Link.txt
    #	499894253 bases of 2620346127 (19.077%) in intersection
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \
	-syntenicNet > syntenicNet.log 2>&1 &
    #	real 9m42.624s

    #	create reciprocal best chains/nets
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzFelCat3.2007-09-28
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 felCat3 \
	> rbest.log 2>&1 &
    #	real    36m40.000s

#########################################################################
# Blastz Elephant loxAfr1 (DONE - 2007-09-28 - 2007-10-02 - Hiram)
    ssh kkstore04
    screen # use screen to control this job
    mkdir /cluster/data/mm9/bed/blastzLoxAfr1.2007-09-28
    cd /cluster/data/mm9/bed/blastzLoxAfr1.2007-09-28

    cat << '_EOF_' > DEF
# Mouse vs. Elephant
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Elephant loxAfr1
SEQ2_DIR=/scratch/hg/loxAfr1/loxAfr1.2bit
SEQ2_LEN=/cluster/data/loxAfr1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzLoxAfr1.2007-09-28
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
	> load.log 2>&1 &
    #	real    2981m3.302s
    #	had two failed jobs in that state where their results existed,
    #	but parasol thought they were not done.  Continuing, and now
    #	all the way to syntenicNet.  Will probably fail during the load
    #	since not everything is there for db loxAfr1
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-continue=cat -bigClusterHub=pk -chainLinearGap=medium DEF \
	-syntenicNet > syntenicNet.log 2>&1 &
    #	real    166m4.710s
    #	it did get through everything to a successful completion
    cat fb.mm9.chainLoxAfr1Link.txt
    #	473014688 bases of 2620346127 (18.052%) in intersection

    #	create reciprocal best chains/nets
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzLoxAfr1.2007-09-28
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 loxAfr1 \
	> rbest.log 2>&1 &
    #	real    41m56.201s

#########################################################################
# Blastz Hedgehog eriEur1 (DONE - 2007-09-28 - 2007-10-02 - Hiram)
    ssh kkstore05
    screen # use screen to control this job
    mkdir /cluster/data/mm9/bed/blastzEriEur1.2007-09-28
    cd /cluster/data/mm9/bed/blastzEriEur1.2007-09-28

    cat << '_EOF_' > DEF
# Mouse vs. Hedgehog
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Hedgehog eriEur1
SEQ2_DIR=/san/sanvol1/scratch/eriEur1/eriEur1.2bit
SEQ2_LEN=/cluster/data/eriEur1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzEriEur1.2007-09-28
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
	> load.log 2>&1 &
    #	failed during the load since the db eriEur1 does not exist
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzEriEur1.2007-09-28/axtChain
    cp -p noClass.net mm9.eriEur1.net
    time nice -n +19 netFilter -minGap=10 mm9.eriEur1.net \
	| hgLoadNet -warn mm9 netEriEur1 stdin
    cd /cluster/data/mm9/bed/blastzEriEur1.2007-09-28
    time nice -n +19 featureBits mm9 chainEriEur1Link \
	> fb.mm9.chainEriEur1Link.txt 2>&1
    cat fb.mm9.chainEriEur1Link.txt
    #	262604655 bases of 2620346127 (10.022%) in intersection

    # continuing through syntenic nets (actually unneeded)
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-continue=download -bigClusterHub=pk -chainLinearGap=medium DEF \
	-syntenicNet > syntenicNet.log 2>&1 &

    #	create reciprocal best chains/nets
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzEriEur1.2007-09-28
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 eriEur1 \
	> rbest.log 2>&1 &
    #	real    33m27.296s

#########################################################################
# Blastz Shrew sorAra1 (DONE - 2007-09-28 - 2007-10-01 - Hiram)
    ssh kkstore05
    screen # use screen to control this job
    mkdir /cluster/data/mm9/bed/blastzSorAra1.2007-09-28
    cd /cluster/data/mm9/bed/blastzSorAra1.2007-09-28

    cat << '_EOF_' > DEF
# Mouse vs. Shrew
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Shrew sorAra1
SEQ2_DIR=/san/sanvol1/scratch/sorAra1/sorAra1.2bit
SEQ2_LEN=/cluster/data/sorAra1/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzSorAra1.2007-09-28
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-stop=chainMerge -bigClusterHub=pk -chainLinearGap=medium DEF \
	> chainMerge.log 2>&1 &
    #	real    2478m57.242s

    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-continue=net -stop=load -bigClusterHub=pk -chainLinearGap=medium DEF \
	> load.log 2>&1 &
    #	real    15m55.272s
    #	as expected, fails during load since there is no sorAra1 database
    #	load nets without class
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzSorAra1.2007-09-28/axtChain
    cp -p noClass.net mm9.sorAra1.net
    time nice -n +19 netFilter -minGap=10 mm9.sorAra1.net \
	| hgLoadNet -warn mm9 netSorAra1 stdin
    cd /cluster/data/mm9/bed/blastzSorAra1.2007-09-28
    time nice -n +19 featureBits mm9 chainSorAra1Link \
	> fb.mm9.chainSorAra1Link.txt 2>&1
    cat fb.mm9.chainSorAra1Link.txt
    #	250412778 bases of 2620346127 (9.556%) in intersection

    #	and, to finish it all off, with syntenic net
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -chainMinScore=3000 \
	-continue=download -bigClusterHub=pk \
	-syntenicNet -chainLinearGap=medium DEF > syntenicNet.log 2>&1 &
    #	real    3m49.961s

    #	create reciprocal best chains/nets
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastzSorAra1.2007-09-28
    time nice -n +19 /cluster/bin/scripts/doRecipBest.pl mm9 sorAra1 \
	> rbest.log 2>&1 &
    #	real    27m3.076s

#########################################################################
## 30-Way Multiz (DONE - 2007-10-01 - Hiram)
##	The blastz alignments for this 30-way are documented at:
##	http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
##
    #	Set up the multiz30way work directory and create three files there:
    #	mm9OnTop.fullNames.nh (tree with Name_db labels and branch lengths),
    #	species.list (one db name per line) and tree.30.nh (db names only).
    ssh hgwdev
    mkdir /cluster/data/mm9/bed/multiz30way
    cd /cluster/data/mm9/bed/multiz30way
    #	take the 28-way tree from hg18 and insert the two new genomes.
    #	rearrange to get mm9 on the top of the graph
    #	paste this tree into the on-line phyloGif tool:
    #	http://genome.ucsc.edu/cgi-bin/phyloGif
    #	to create the image for the tree diagram

    cat << '_EOF_' > mm9OnTop.fullNames.nh
((((((((

 (((Mouse_mm9:0.076274,Rat_rn4:0.084383):0.200607,
    GuineaPig_cavPor2:0.202990):0.034350,
        Rabbit_oryCun1:0.208548):0.014587,

((((((Human_hg18:0.005873,Chimp_panTro2:0.007668):0.013037,
   Orangutan_ponAbe2:0.02):0.013037,Rhesus_rheMac2:0.031973):0.0365,
        Marmoset_calJac1:0.07):0.0365,Bushbaby_otoGar1:0.151185):0.015682,
           TreeShrew_tupBel1:0.162844):0.006272):0.019763,

 ((Shrew_sorAra1:0.248532,Hedgehog_eriEur1:0.222255):0.045693,

 (((Dog_canFam2:0.101137,Cat_felCat3:0.098203):0.048213,
    Horse_equCab1:0.099323):0.007287,
        Cow_bosTau3:0.163945):0.012398):0.018928):0.030081,

 (Armadillo_dasNov1:0.133274,(Elephant_loxAfr1:0.103030,
        Tenrec_echTel1:0.232706):0.049511):0.008424):0.213469,

 Opossum_monDom4:0.320721):0.088647,
    Platypus_ornAna1:0.488110):0.118797,
        (Chicken_galGal3:0.395136,Lizard_anoCar1:0.513962):0.093688):0.151358,
            Frog_xenTro2:0.778272):0.174596,

 (((Tetraodon_tetNig1:0.203933,Fugu_fr2:0.239587):0.203949,
    (Stickleback_gasAcu1:0.314162,Medaka_oryLat1:0.501915):0.055354):0.346008,
Zebrafish_danRer5:0.730028):0.174596);
'_EOF_'
    # << happy emacs
    
    #	create a species list from that file:
    #	strip the parentheses, break the text at blanks and commas, drop
    #	empty lines, then keep only the text between the last "_" and the
    #	first ":" of each label (the db name)
    sed -e 's/[()]//g; s/ /\n/g; s/,/\n/g' mm9OnTop.fullNames.nh \
        | sed -e "s/[ \t]*//g; /^[ \t]$/d; /^$/d" | sort -u \
        | sed -e "s/.*_//; s/:.*//" | sort > species.list
    #	verify that has 30 db names in it
    # create a stripped down nh file for use in autoMZ run:
    #	the Name_ prefixes and the :0.nnn branch lengths are removed and
    #	commas/semicolons become blanks; the unquoted backtick expansion
    #	lets echo write the whole tree on a single line
    echo \
`sed 's/[a-zA-Z0-9]*_//g; s/:0.[0-9]*//g; s/[,;]/ /g' mm9OnTop.fullNames.nh \
	| sed -e "s/  / /g"` > tree.30.nh
    #	that looks like, as a single line:
(((((((( (((mm9 rn4) cavPor2) oryCun1) ((((((hg18 panTro2) ponAbe2) rheMac2)
calJac1) otoGar1) tupBel1)) ((sorAra1 eriEur1) (((canFam2 felCat3) equCab1)
bosTau3))) (dasNov1 (loxAfr1 echTel1))) monDom4) ornAna1) (galGal3 anoCar1))
xenTro2) (((tetNig1 fr2) (gasAcu1 oryLat1)) danRer5))

    # verify that all the blastz alignments exist
    #	listMafs.csh reports, for each db in species.list, which of
    #	mafRBestNet, mafSynNet or mafNet (checked in that order) holds a
    #	chr1.maf.gz under /cluster/data/mm9/bed/blastz.$db, or prints
    #	"mafs not found" when none does
    cat << '_EOF_' > listMafs.csh
#!/bin/csh -fe
cd /cluster/data/mm9/bed/multiz30way
foreach db (`cat species.list`)
    set bdir = /cluster/data/mm9/bed/blastz.$db
    if (-e $bdir/mafRBestNet/chr1.maf.gz) then
	echo "$db mafRBestNet"
    else if (-e $bdir/mafSynNet/chr1.maf.gz) then
	echo "$db mafSynNet"
    else if (-e $bdir/mafNet/chr1.maf.gz) then
	echo "$db mafNet"
    else
	echo "$db mafs not found"
    endif
end
'_EOF_'
    # << happy emacs
    chmod +x ./listMafs.csh
    #	see what it says, shouldn't be anything with "mafs not found"
    ./listMafs.csh

    # copy net mafs to cluster-friendly storage, splitting chroms
    # into 50MB chunks to improve run-time
    # NOTE: splitting will be different for scaffold-based reference assemblies
    ssh hgwdev
    mkdir /cluster/data/mm9/bed/multiz30way/run.split
    cd /cluster/data/mm9/bed/multiz30way/run.split
    #	this works by examining the rmsk table for likely repeat areas
    #	that won't be used in blastz
    #	(the split positions are written to mafSplit.bed)
    mafSplitPos mm9 50 mafSplit.bed

    ssh kki
    cd /cluster/data/mm9/bed/multiz30way/run.split
 
    #	doSplit.csh <db>: split every maf of one pairwise alignment at the
    #	positions in mafSplit.bed.  The source directory is the first of
    #	mafRBestNet, mafSynNet, mafNet found under
    #	/cluster/data/mm9/bed/blastz.<db>; results are written to
    #	/san/sanvol1/scratch/mm9/splitStrictMafNet/<db>/ and gzipped.
    #	The script refuses to run when that output directory already
    #	exists, and writes <db>.done (naming the source directory) on
    #	completion.
    cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
set db = $1
set sdir = /san/sanvol1/scratch/mm9/splitStrictMafNet
mkdir -p $sdir
if (-e $sdir/$db) then
    echo "directory $sdir/$db already exists -- remove and retry"
    exit 1
endif
set bdir = /cluster/data/mm9/bed/blastz.$db
if (! -e $bdir) then
    echo "directory $bdir not found"
    exit 1
endif
mkdir -p $sdir/$db
if (-e $bdir/mafRBestNet) then
    set mdir = $bdir/mafRBestNet
else if (-e $bdir/mafSynNet) then
    set mdir = $bdir/mafSynNet
else if (-e $bdir/mafNet) then
    set mdir = $bdir/mafNet
else
    echo "$bdir maf dir not found"
    exit 1
endif
echo $mdir
foreach f ($mdir/*)
    set c = $f:t:r:r
    echo "  $c"
    nice mafSplit mafSplit.bed $sdir/$db/ $f
end
echo "gzipping $sdir/$db mafs"
nice gzip $sdir/$db/*
echo $mdir > $db.done
'_EOF_'
    # << happy emacs
    chmod +x doSplit.csh

    #	one job per species other than mm9 itself; each job is checked
    #	by the presence of a non-empty <db>.done file
    grep -v mm9  ../species.list > split.list
    cat << '_EOF_' > template
#LOOP
doSplit.csh $(path1) {check out line+ $(path1).done}
#ENDLOOP
'_EOF_'
    gensub2 split.list single template jobList
    para create jobList
    # 29 jobs
    # start these gently, this is a good load on the san filesystem
    para try
    # let that run to a couple completions, a few minutes, then again:
    para try
    # etc ...
# Completed: 29 of 29 jobs
# CPU time in finished jobs:       9476s     157.94m     2.63h    0.11d  0.000 y
# IO & Wait Time:                  1531s      25.51m     0.43h    0.02d  0.000 y
# Average job time:                 380s       6.33m     0.11h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1081s      18.02m     0.30h    0.01d
# Submission to last job:          1391s      23.18m     0.39h    0.02d

    # ready for the multiz run
    ssh pk
    cd /cluster/data/mm9/bed/multiz30way
    #	actually, the result directory here should be maf.split instead of maf
    mkdir -p maf run
    cd run
    mkdir penn
    # use latest penn utilities
    P=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
    cp -p $P/{autoMZ,multiz,maf_project} penn

    # list chrom chunks, any db dir will do; better would be for the
    # splitter to generate this file
    # We temporarily use __ instead of . to delimit chunk in filename
    # so we can use $(root) to get basename
    #	The unique chunk names end up in chromChunks.list, which is the
    #	file counted here and later handed to gensub2.
    find /san/sanvol1/scratch/mm9/splitStrictMafNet -type f \
	| while read F; do basename $F; done \
	| sed -e 's/.maf.gz//' -e 's/\./__/' \
	| sort -u > chromChunks.list
    wc -l chromChunks.list
        # 75

# autoMultiz.csh <chunk> <result.maf>: run autoMZ for one chrom chunk.
# <chunk> uses "__" where the split file name has "."; the script converts
# it back to find $pairs/<species>/<chunk>.maf(.gz).  Work is done in
# /scratch/tmp/mm9/multiz.<chunk>: each pairwise maf is copied or
# uncompressed to mm9.<species>.sing.maf (a header-only maf is written
# when a species has no file for the chunk), autoMZ is run with the tree
# from tree.30.nh, the result is copied to <result.maf> and the work
# directory is removed.
cat > autoMultiz.csh << '_EOF_'
#!/bin/csh -ef

    set db = mm9
    set c = $1
    set maf = $2
    set run = `pwd`
    set tmp = /scratch/tmp/$db/multiz.$c
    set pairs = /san/sanvol1/scratch/$db/splitStrictMafNet
    rm -fr $tmp
    mkdir -p $tmp
    cp ../tree.30.nh ../species.list $tmp
    pushd $tmp
    foreach s (`cat species.list`)
        set c2 = `echo $c | sed 's/__/./'`
        set in = $pairs/$s/$c2.maf
        set out = $db.$s.sing.maf
        if ($s == mm9) then
            continue
        endif
        if (-e $in.gz) then
            zcat $in.gz > $out
        else if (-e $in) then
            cp $in $out
        else
            echo "##maf version=1 scoring=autoMZ" > $out
        endif
    end
    set path = ($run/penn $path); rehash
    $run/penn/autoMZ + T=$tmp E=$db "`cat tree.30.nh`" $db.*.sing.maf $c.maf
    popd
    cp $tmp/$c.maf $maf
    rm -fr $tmp
'_EOF_'
# << happy emacs
    chmod +x autoMultiz.csh

# one autoMultiz.csh job per entry in chromChunks.list; each job is
# checked by its result file maf/<chunk>.maf being non-empty
cat  << '_EOF_' > template
#LOOP
./autoMultiz.csh $(root1) {check out line+ /cluster/data/mm9/bed/multiz30way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << emacs
    gensub2 chromChunks.list single template jobList
    para create jobList
    # 75 jobs
    #	three of these jobs failed with memory allocation error:
# maf_project.v12: Ran out of memory trying to allocate 64.
# autoMZ.v1: command 'maf_project /scratch/tmp/mm9/multiz.chr10__1/_MZ_16482_lef
# t.maf19 mm9 > /scratch/tmp/mm9/multiz.chr10__1/_MZ_16482_U1' failed
    # run time of the 72 jobs that did complete:
# Completed: 72 of 75 jobs
# CPU time in finished jobs:     501143s    8352.38m   139.21h    5.80d  0.016 y
# IO & Wait Time:                 22628s     377.14m     6.29h    0.26d  0.001 y
# Average job time:                7275s     121.24m     2.02h    0.08d
# Longest finished job:           15957s     265.95m     4.43h    0.18d
# Submission to last job:         16473s     274.55m     4.58h    0.19d
    #	performed a para recover on the jobList and used the kki kluster
    #	to run the last three jobs:
# Completed: 3 of 3 jobs
# CPU time in finished jobs:      50762s     846.03m    14.10h    0.59d  0.002 y
# IO & Wait Time:                  1795s      29.92m     0.50h    0.02d  0.000 y
# Average job time:               17519s     291.98m     4.87h    0.20d
# Longest finished job:           17887s     298.12m     4.97h    0.21d
# Submission to last job:         17887s     298.12m     4.97h    0.21d

    # put the split maf results back together into single chroms
    ssh kkstore06
    cd /cluster/data/mm9/bed/multiz30way
    # here is where the result directory maf should have already been maf.split
    mv maf maf.split
    mkdir maf
    # going to sort out the redundant header garbage to leave a cleaner maf
    #	For each chrom (chunk file name up to the "__"), the combined maf
    #	is written in this order: the unique first line of the chunks, the
    #	unique remaining "#" lines (minus the version/eof lines, with the
    #	_MZ_ temp names and chunk suffixes removed), every non-"#" line,
    #	and finally the unique last line of the chunks.
    for C in $(ls maf.split | sed -e "s#__.*##" | sort -u)
do
    echo ${C}
    {
	head -q -n 1 maf.split/${C}__*.maf | sort -u
	grep -h "^#" maf.split/${C}__*.maf \
	    | grep -E -v "maf version=1|eof maf" \
	    | sed -e "s#_MZ_[^ ]* # #g; s#__[0-9]##g" | sort -u
	grep -h -v "^#" maf.split/${C}__*.maf
	tail -q -n 1 maf.split/${C}__*.maf | sort -u
    } > maf/${C}.maf
done

    # load tables for a look
    #	the per-chrom mafs are symlinked into /gbdb and loaded from there
    ssh hgwdev
    mkdir -p /gbdb/mm9/multiz30way/maf
    ln -s /cluster/data/mm9/bed/multiz30way/maf/*.maf \
                /gbdb/mm9/multiz30way/maf
    cd /cluster/data/mm9/bed/multiz30way
    # this generates a large 1 Gb multiz30way.tab file in the directory
    #	where it is running.  Best to run this over in scratch.
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf \
	-pathPrefix=/gbdb/mm9/multiz30way/maf mm9 multiz30way
    #	real    11m38.695s
    #	Loaded 15881850 mafs in 34 files from /gbdb/mm9/multiz30way/maf

    # load summary table
    #	all mafs are concatenated and read by hgLoadMafSummary from stdin
    time nice -n +19 cat /gbdb/mm9/multiz30way/maf/*.maf \
	| hgLoadMafSummary mm9 -minSize=30000 -mergeGap=1500 \
	 -maxSize=200000  multiz30waySummary stdin
    #	Created 5648546 summary blocks from 154642836 components and 15872991
    #	mafs from stdin
    #	real    19m44.355s

    # Gap Annotation
    # prepare bed files with gap info
    ssh kkstore06
    mkdir /cluster/data/mm9/bed/multiz30way/anno
    cd /cluster/data/mm9/bed/multiz30way/anno
    mkdir maf run

    #	check every species for /cluster/data/<db>/<db>.N.bed
    #	NOTE(review): when the file is missing, the twoBitInfo command is
    #	only echoed, not executed -- presumably it is then run by hand;
    #	confirm before relying on this loop to create the files
    for DB in `cat ../species.list`
do
    CDIR="/cluster/data/${DB}"
    if [ ! -f ${CDIR}/${DB}.N.bed ]; then
	echo "creating ${DB}.N.bed"
	echo twoBitInfo -nBed ${CDIR}/${DB}.2bit ${CDIR}/${DB}.N.bed
    else
	ls -og ${CDIR}/${DB}.N.bed
    fi
done

    #	in the run directory, symlink each non-mm9 species' N.bed and
    #	chrom.sizes as <db>.bed and <db>.len, and record those names in
    #	the list files nBeds and sizes
    cd run
    rm -f nBeds sizes
    for DB in `grep -v mm9 ../../species.list`
do
    echo "${DB} "
    ln -s  /cluster/data/${DB}/${DB}.N.bed ${DB}.bed
    echo ${DB}.bed  >> nBeds
    ln -s  /cluster/data/${DB}/chrom.sizes ${DB}.len
    echo ${DB}.len  >> sizes
done

    ssh kki
    cd /cluster/data/mm9/bed/multiz30way/anno/run

    #	doAnno.csh <chrom> <out.maf>: pipe maf/<chrom>.maf through
    #	mafAddIRows, using the nBeds list and mm9.2bit, writing <out.maf>
    cat << '_EOF_' > doAnno.csh
#!/bin/csh -ef
    set dir = /cluster/data/mm9/bed/multiz30way
    set c = $1
    cat $dir/maf/${c}.maf | \
        nice mafAddIRows -nBeds=nBeds stdin /cluster/data/mm9/mm9.2bit $2
'_EOF_'
    # << happy emacs
    chmod +x doAnno.csh

    #	one job per chrom; checked by a non-empty anno/maf/<chrom>.maf
    cat << '_EOF_' > template
#LOOP
./doAnno.csh $(root1) {check out line+ /cluster/data/mm9/bed/multiz30way/anno/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #	there is no 16_random maf file
    cut -f1 /cluster/data/mm9/chrom.sizes | grep -v 16_random > chrom.list
    gensub2 chrom.list single template jobList
    para create jobList
    para try
#	Crashed: 1 jobs
# CPU time in finished jobs:      18129s     302.15m     5.04h    0.21d  0.001 y
# IO & Wait Time:                 10273s     171.22m     2.85h    0.12d  0.000 y
# Average job time:                 861s      14.34m     0.24h    0.01d
# Longest finished job:            4376s      72.93m     1.22h    0.05d
    #	one job was too large for this memory:
    # job: ./doAnno.csh chr1 /cluster/data/mm9/bed/multiz30way/anno/maf/chr1.maf
    # needLargeMem: Out of memory - request size 1129396 bytes, errno: 12
    #	going to hgwdev for this one:
    #	run the one job that ran out of memory on the cluster directly
    ssh hgwdev
    cd /cluster/data/mm9/bed/multiz30way/anno/run
    time ./doAnno.csh chr1 ../maf/chr1.maf
    #	real    17m34.550s

    #	symlink the annotated mafs into /gbdb and load them
    ssh hgwdev
    cd /cluster/data/mm9/bed/multiz30way/anno
    mkdir -p /gbdb/mm9/multiz30way/anno/maf
    ln -s /cluster/data/mm9/bed/multiz30way/anno/maf/*.maf \
                /gbdb/mm9/multiz30way/anno/maf
    #	by loading this into the table multiz30way, it will replace the
    #	previously loaded table with the unannotated mafs
    #	huge temp files are made, do them on local disk
    cd /scratch/tmp
    time nice -n +19 hgLoadMaf -pathPrefix=/gbdb/mm9/multiz30way/anno/maf \
                mm9 multiz30way
    #	Loaded 16799995 mafs in 34 files from /gbdb/mm9/multiz30way/anno/maf
    #	real    18m12.171s

    #	This step may be useless.  The original mafs should have the same
    #	summary.
    #	For every chrom.sizes entry whose size is over 1000000, print the
    #	path of its annotated maf; the files are concatenated in that
    #	order and read by hgLoadMafSummary from stdin.
    awk '$2 > 1000000 { print "/gbdb/mm9/multiz30way/anno/maf/" $1 ".maf" }' \
	/cluster/data/mm9/chrom.sizes | xargs cat \
	| hgLoadMafSummary mm9 -minSize=30000 -mergeGap=1500 \
	    -maxSize=200000  multiz30waySummary stdin
    #	Created 5648546 summary blocks from 154642836 components and 16790208
    #	mafs from stdin
    #	by loading this into the table multiz30waySummary, it will replace
    #	the previously loaded table with the unannotated mafs
    #	real    30m26.542s

#############################################################################
## Annotate 30-way multiple alignment with gene annotations
##		(DONE - 2007-10-18 - Hiram)
    # Gene frames
    ## survey all genomes to see what type of gene track to use
    ssh hgwdev
    mkdir /cluster/data/mm9/bed/multiz30way/frames
    cd /cluster/data/mm9/bed/multiz30way/frames
    #	dbs: eriEur1, cavPor2, sorAra1 do not exist, can not look at them
    #	showGenes.csh prints one line per remaining db: the row counts of
    #	any ensGene, refGene, mgcGenes and knownGene tables it has, then
    #	the number of mm9 gbCdnaInfo rows for that organism.  The organism
    #	id is found by taking scientificName from hgcentraltest.dbDb and
    #	looking it up in the mm9 organism table.
    cat << '_EOF_' > showGenes.csh
#!/bin/csh -fe
foreach db (`egrep -v "sorAra1|eriEur1|cavPor2"  ../species.list`)
    echo -n "${db}: "
    echo -n "Tables: "
    set tables = `hgsql $db -N -e "show tables like '%Gene%'"`
    foreach table ($tables)
	if ($table == "ensGene" || $table == "refGene" || $table == "mgcGenes" || \
	    $table == "knownGene") then
		set count = `hgsql $db -N -e "select count(*) from $table"`
		echo -n "${table}: ${count}, "
	endif
    end
    set orgName = `hgsql hgcentraltest -N -e \
	    "select scientificName from dbDb where name='$db'"`
    set orgId = `hgsql mm9 -N -e \
	    "select id from organism where name='$orgName'"`
    if ($orgId == "") then
	echo "Mrnas: 0"
    else
	set count = `hgsql mm9 -N -e "select count(*) from gbCdnaInfo where organism=$orgId"`
	echo "Mrnas: ${count}"
    endif
end
'_EOF_'
    # << happy emacs
    chmod +x ./showGenes.csh
    #	given this output, manually sorted for this display:
# hg18: Tables: ensGene: 43569, knownGene: 56722, mgcGenes: 29028, refGene: 25902, Mrnas: 208990
# mm9: Tables: knownGene: 49409, mgcGenes: 22947, refGene: 21004, Mrnas: 5092390
# rn4: Tables: ensGene: 33745, knownGene: 8202, mgcGenes: 5400, refGene: 14333, Mrnas: 34471
# canFam2: Tables: ensGene: 25568, refGene: 833, Mrnas: 1708
# danRer5: Tables: ensGene: 31740, mgcGenes: 13037, refGene: 12879, Mrnas: 33184
# fr2: Tables: ensGene: 22102, Mrnas: 1098
# gasAcu1: Tables: ensGene: 28840, Mrnas: 2326
# monDom4: Tables: ensGene: 33878, refGene: 163, Mrnas: 398
# ornAna1: Tables: ensGene: 25981, refGene: 3, Mrnas: 141
# oryLat1: Tables: ensGene: 23087, Mrnas: 980
# panTro2: Tables: ensGene: 32852, refGene: 26160, Mrnas: 1277
# rheMac2: Tables: ensGene: 38561, refGene: 412, Mrnas: 3169
# bosTau3: Tables: mgcGenes: 9617, refGene: 10287, Mrnas: 26808
# equCab1: Tables: refGene: 304, Mrnas: 1396
# felCat3: Tables: refGene: 401, Mrnas: 882
# galGal3: Tables: refGene: 4210, Mrnas: 31217
# xenTro2: Tables: mgcGenes: 6255, refGene: 7086, Mrnas: 19155
# anoCar1: Tables: Mrnas: 12
# calJac1: Tables: Mrnas: 949
# dasNov1: Tables: Mrnas: 18
# echTel1: Tables: Mrnas: 0
# loxAfr1: Tables: Mrnas: 12
# oryCun1: Tables: Mrnas: 3786
# otoGar1: Tables: Mrnas: 0
# ponAbe2: Tables: Mrnas: 2
# tetNig1: Tables: Mrnas: 99495
# tupBel1: Tables: Mrnas: 47

    #	use knownGene for hg18, mm9
    #	use ensGene for rn4, canFam2, danRer5, fr2, gasAcu1, monDom4, ornAna1,
    #		oryLat1, panTro2, rheMac2
    #	use refGene for bosTau3, xenTro2
    #	use Mrnas for galGal3, tetNig1
    #	barely can use Mrnas for equCab1, felCat3, anoCar1, dasNov1,
    #	loxAfr1, oryCun1, ponAbe2
    #	no annotations for calJac1, echTel1, otoGar1, sorAra1, eriEur1, cavPor2
    #		tupBel1

    #	The three loops below differ only in the source table.  Each one
    #	selects the genePred columns, reduces them with genePredSingleCover,
    #	compresses into /scratch/tmp and then moves the result to
    #	genes/<db>.gp.gz.
    mkdir genes
    # knownGene
    for DB in hg18 mm9
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from knownGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done
    # ensGene
    for DB in rn4 canFam2 danRer5 fr2 gasAcu1 monDom4 \
	ornAna1 oryLat1 panTro2 rheMac2
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from ensGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done
    # refGene
    for DB in bosTau3 xenTro2
do
    hgsql -N -e "select name,chrom,strand,txStart,txEnd,cdsStart,cdsEnd,exonCount,exonStarts,exonEnds from refGene" ${DB} \
      | genePredSingleCover stdin stdout | gzip -2c \
        > /scratch/tmp/${DB}.tmp.gz
    mv /scratch/tmp/${DB}.tmp.gz genes/$DB.gp.gz
    echo "${DB} done"
done
    # and finally, using the mrna tables

    #	use Mrnas for galGal3 tetNig1 equCab1 felCat3 anoCar1 dasNov1
    #	loxAfr1 oryCun1 ponAbe2
    #	For each db: all_mrna rows that have a CDS are selected with the
    #	accession and CDS string in the first two columns.  Columns 1-2
    #	become the -cdsFile, columns 4 onward become the input of
    #	mrnaToGene, and its output goes through genePredSingleCover into
    #	genes/<db>.gp.gz.  The name of the file created by mktemp is
    #	reused as the suffix of the three scratch files.
    for DB in galGal3 tetNig1 equCab1 felCat3 anoCar1 dasNov1 \
	loxAfr1 oryCun1 ponAbe2
do
tmpExt=`mktemp temp.XXXXXX`
tmpMrnaCds=${DB}.mrna-cds.${tmpExt}
tmpMrna=${DB}.mrna.${tmpExt}
tmpCds=${DB}.cds.${tmpExt}
hgsql -N -e 'select all_mrna.qName,cds.name,all_mrna.* \
	   from all_mrna,gbCdnaInfo,cds \
	   where (all_mrna.qName = gbCdnaInfo.acc) and \
	     (gbCdnaInfo.cds != 0) and (gbCdnaInfo.cds = cds.id)' \
$DB > ${tmpMrnaCds}
cut -f 1-2  ${tmpMrnaCds} > ${tmpCds}
cut -f 4-100  ${tmpMrnaCds} > ${tmpMrna}
mrnaToGene -cdsFile=${tmpCds} -smallInsertSize=8 -quiet ${tmpMrna} stdout | \
genePredSingleCover stdin stdout | gzip -2c > /scratch/tmp/$DB.tmp.gz
rm ${tmpMrnaCds} ${tmpMrna} ${tmpCds}
mv /scratch/tmp/$DB.tmp.gz genes/$DB.gp.gz
rm -f $tmpExt
echo "${DB} done"
done

    ##################################################
    # redmine GB - Feature #480 -  missing self frames on multiz
    #   NwayFrames tables (DONE 2010-07-29)
    #   re-run the genePredToMafFrames with mm9 genes/mm9
  
    ssh kkstore06
    cd /cluster/data/mm9/bed/multiz30way/frames
    # leaving out calJac1, echTel1, otoGar1, sorAra1, eriEur1, cavPor2
    #		tupBel1 since no gene preds there
    #	all mafs are read from stdin; each remaining db is given as a
    #	"<db> genes/<db>.gp.gz" pair, with mm9 itself first.  The frames
    #	are gzipped to multiz30way.mafFrames.gz; timing and messages go
    #	to frames.log.
    time (cat  ../maf/*.maf | nice -n +19 genePredToMafFrames mm9 stdin stdout mm9 genes/mm9.gp.gz rn4 genes/rn4.gp.gz hg18 genes/hg18.gp.gz rheMac2 genes/rheMac2.gp.gz ponAbe2 genes/ponAbe2.gp.gz panTro2 genes/panTro2.gp.gz equCab1 genes/equCab1.gp.gz dasNov1 genes/dasNov1.gp.gz oryCun1 genes/oryCun1.gp.gz felCat3 genes/felCat3.gp.gz canFam2 genes/canFam2.gp.gz loxAfr1 genes/loxAfr1.gp.gz bosTau3 genes/bosTau3.gp.gz monDom4 genes/monDom4.gp.gz ornAna1 genes/ornAna1.gp.gz galGal3 genes/galGal3.gp.gz anoCar1 genes/anoCar1.gp.gz xenTro2 genes/xenTro2.gp.gz gasAcu1 genes/gasAcu1.gp.gz danRer5 genes/danRer5.gp.gz tetNig1 genes/tetNig1.gp.gz fr2 genes/fr2.gp.gz oryLat1 genes/oryLat1.gp.gz | gzip > multiz30way.mafFrames.gz) > frames.log 2>&1
    # see what it looks like in terms of number of annotations per DB:
    zcat multiz30way.mafFrames.gz | cut -f4 | sort | uniq -c | sort -n
     67 loxAfr1
     79 dasNov1
    116 ponAbe2
    491 anoCar1
   1807 tetNig1
   2429 felCat3
   4892 equCab1
   9156 oryCun1
  85568 bosTau3
 118192 galGal3
 129442 xenTro2
 185607 mm9
 208239 rn4
 224420 rheMac2
 226866 panTro2
 228563 hg18
 243074 canFam2
 329523 danRer5
 334418 ornAna1
 347708 oryLat1
 369267 monDom4
 374016 gasAcu1
 380839 fr2

    #	load the resulting file into table multiz30wayFrames
    ssh hgwdev
    cd /cluster/data/mm9/bed/multiz30way/frames
    time nice -n +19 hgLoadMafFrames mm9 multiz30wayFrames \
	multiz30way.mafFrames.gz
    #	real     0m43.134s

    #	enable the trackDb entries:
# frames multiz30wayFrames
# irows on

#############################################################################
# phastCons 30-way (DONE - 2007-10-16 - Hiram)

    # split 30way mafs into 10M chunks and generate sufficient statistics
    # files for phastCons
    ssh kki
    mkdir /cluster/data/mm9/bed/multiz30way/msa.split
    cd /cluster/data/mm9/bed/multiz30way/msa.split
    mkdir -p /san/sanvol1/scratch/mm9/multiz30way/cons/ss

    #	doSplit.csh <chrom>: extract the chrom sequence from mm9.2bit,
    #	write a name-cleaned copy of <chrom>.maf to /scratch/tmp, and run
    #	msa_split on it (-w 10000000,0) to produce SS files under
    #	$WINDOWS/<chrom>/.  The scratch files are removed afterwards and
    #	the date is appended to <chrom>.done in the starting directory.
    cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
    set MAFS = /cluster/data/mm9/bed/multiz30way/maf
    set WINDOWS = /san/sanvol1/scratch/mm9/multiz30way/cons/ss
    pushd $WINDOWS
    set c = $1
    rm -fr $c
    mkdir $c
    twoBitToFa -seq=$c /scratch/data/mm9/mm9.2bit /scratch/tmp/mm9.$c.fa
    # need to truncate odd-ball scaffold/chrom names that include dots
    # as phastCons utils can't handle them
    set CLEAN_MAF = /scratch/tmp/$c.clean.maf.$$
    perl -wpe 's/^s ([^.]+\.[^. ]+)\.\S+/s $1/' $MAFS/$c.maf > $CLEAN_MAF
    /cluster/bin/phast/$MACHTYPE/msa_split $CLEAN_MAF -i MAF \
        -M /scratch/tmp/mm9.$c.fa \
        -o SS -r $c/$c -w 10000000,0 -I 1000 -B 5000
    rm -f $CLEAN_MAF /scratch/tmp/mm9.$c.fa
    popd
    date >> $c.done
'_EOF_'
    # << happy emacs
    chmod +x doSplit.csh

    #	one job per maf; checked by a non-empty <chrom>.done file
    cat << '_EOF_' > template
#LOOP
doSplit.csh $(root1) {check out line+ $(root1).done}
#ENDLOOP
'_EOF_'
    # << happy emacs

    #	do the easy ones first to see some immediate results
    #	(ls -S -r lists the maf files smallest first)
    ls -1S -r ../maf | sed -e "s/.maf//" > maf.list

    gensub2 maf.list single template jobList
    para create jobList
    para try ... check ... etc
-
    # completed shorter jobs in a few hours, there is a problem of swapping
    # going on here, two of these jobs on a single node can consume all of its
    # memory and then some.  Three jobs failed to complete, finish them up
    # manually on hgwdev, the processes grow to over 8 Gb in memory for chr1,
    # chr11 and chr2

    # Estimate phastCons parameters
    #	phyloFit is run on the first chrY SS window with the 30-way tree
    #	topology; --out-root starting-tree names the output model, which
    #	is read below as starting-tree.mod

    time nice -n +19 /cluster/bin/phast.2007-05-04/phyloFit -i SS \
/san/sanvol1/scratch/mm9/multiz30way/cons/ss/chrY/chrY.1-10000000.ss \
--tree "(((((((((((mm9,rn4),cavPor2),oryCun1),((((((hg18,panTro2),ponAbe2),rheMac2),calJac1),otoGar1),tupBel1)),((sorAra1,eriEur1),(((canFam2,felCat3),equCab1),bosTau3))),(dasNov1,(loxAfr1,echTel1))),monDom4),ornAna1),(galGal3,anoCar1)),xenTro2),(((tetNig1,fr2),(gasAcu1,oryLat1)),danRer5))" \
    --out-root starting-tree
    #	real    107m46.703s
    #	Tried this on chr13 too:
    #	real    4619m42.984s
    #	that is almost 77 hours on hgwdev == 3.2 days

    # add up the C and G:
    #	(the sum of fields 3 and 4 of the BACKGROUND line)
    grep BACKGROUND starting-tree.mod | awk '{printf "%0.3f\n", $3 + $4;}'
    #	0.400
    #	This 0.400 is used in the --gc argument below
    #	got 0.404 with chrM.starting-tree.mod

    # Run phastCons
    #	This job is I/O intensive in its output files, thus it is all
    #	working over in /scratch/tmp/
    ssh pk
    mkdir -p /cluster/data/mm9/bed/multiz30way/cons/run.cons
    cd /cluster/data/mm9/bed/multiz30way/cons/run.cons

    #	there are going to be several different phastCons runs using
    #	this same script.  They trigger off of the current working directory
    #	$cwd:t which is the "grp" in this script.  It is one of:
    #	all glires placentals
    #
    #	doPhast.csh <chrom> <ssFile> <len> <cov> <rho>
    #	  <ssFile> is the SS file name without the .ss suffix; <len>, <cov>
    #	  and <rho> are passed to phastCons as --expected-length,
    #	  --target-coverage and --rho.
    #	The script copies $san/ss/<chrom>/<ssFile>.ss and $cons/$grp/$grp.mod
    #	(plus $grp.non-inf, when that file is non-empty, whose contents are
    #	then passed as --not-informative) to /scratch/tmp/<ssFile>, runs
    #	phastCons there, and moves <ssFile>.pp and <ssFile>.bed to
    #	$san/$grp/pp/<chrom> and $san/$grp/bed/<chrom>.
    #	Note it also copies the .mod (and .non-inf) file into the
    #	directory the job is started from.

    cat << '_EOF_' > doPhast.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.2007-05-04
set c = $1
set f = $2
set len = $3
set cov = $4
set rho = $5
set grp = $cwd:t
set tmp = /scratch/tmp/$f
set cons = /cluster/data/mm9/bed/multiz30way/cons
mkdir -p $tmp
set san = /san/sanvol1/scratch/mm9/multiz30way/cons
if (-s $cons/$grp/$grp.non-inf) then
  cp -p $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf .
  cp -p $san/ss/$c/$f.ss $cons/$grp/$grp.mod $cons/$grp/$grp.non-inf $tmp
else
  cp -p $cons/$grp/$grp.mod .
  cp -p $san/ss/$c/$f.ss $cons/$grp/$grp.mod $tmp
endif
pushd $tmp > /dev/null
if (-s $grp.non-inf) then
  $PHASTBIN/phastCons $f.ss $grp.mod \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --not-informative `cat $grp.non-inf` \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
else
  $PHASTBIN/phastCons $f.ss $grp.mod \
    --rho $rho --expected-length $len --target-coverage $cov --quiet \
    --seqname $c --idpref $c --most-conserved $f.bed --score > $f.pp
endif
popd > /dev/null
mkdir -p $san/$grp/pp/$c $san/$grp/bed/$c
sleep 4
touch $san/$grp/pp/$c $san/$grp/bed/$c
rm -f $san/$grp/pp/$c/$f.pp
rm -f $san/$grp/bed/$c/$f.bed
mv $tmp/$f.pp $san/$grp/pp/$c
mv $tmp/$f.bed $san/$grp/bed/$c
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod a+x doPhast.csh

    #	job template: doPhast.csh arguments are chrom, ss file name,
    #	expected length 45, target coverage .3, rho .31
    cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/mm9/multiz30way/cons/all/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
    # << happy emacs

    # Create parasol batch and run it
    #	ss.list holds every SS file path relative to the san cons
    #	directory, with the .ss suffix removed
    pushd /san/sanvol1/scratch/mm9/multiz30way/cons
    ls -1 ss/chr*/chr*.ss | sed 's/.ss$//' > \
        /cluster/data/mm9/bed/multiz30way/cons/run.cons/ss.list
    popd

    # run for all species
    #	The model file is placed in cons/all.  The parasol batch itself
    #	must be created in cons/run.cons/all: doPhast.csh takes its group
    #	name from the last component of the working directory, and both
    #	../doPhast.csh and ../ss.list live in cons/run.cons.
    cd ..
    mkdir -p all run.cons/all
    cd all
    cp ../../chrY.starting-tree.mod all.mod
    cd ../run.cons/all

    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create template file for "all" run
    cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/mm9/multiz30way/cons/all/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 ../ss.list single template jobList
    para create jobList
    para try ... check ... push ... etc.
# Completed: 294 of 294 jobs
# CPU time in finished jobs:      25724s     428.73m     7.15h    0.30d  0.001 y
# IO & Wait Time:                  8951s     149.19m     2.49h    0.10d  0.000 y
# Average job time:                 118s       1.97m     0.03h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             226s       3.77m     0.06h    0.00d
# Submission to last job:           582s       9.70m     0.16h    0.01d

    # create Most Conserved track
    #	all per-window bed files are concatenated and sorted by chrom and
    #	start; awk rewrites column 4 as "lod=<col5>" and keeps column 5,
    #	and lodToBedScore produces mostConserved.bed
    ssh kolossus
    cd /san/sanvol1/scratch/mm9/multiz30way/cons/all
    cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
        awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
            /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    #	~ 1 minute
    cp -p mostConserved.bed /cluster/data/mm9/bed/multiz30way/cons/all

    # load into database
    ssh hgwdev
    cd /cluster/data/mm9/bed/multiz30way/cons/all
    time nice -n +19 hgLoadBed mm9 phastConsElements30way mostConserved.bed
    #	Loaded 2782368 elements of size 5
    #	real    1m15.673s
    # compare with previous tracks
    hgsql mm9 -s -N -e "select count(*) from phastConsElements30way"
    #	2782368
    hgsql mm8 -s -N -e "select count(*) from phastConsElements17way"
    #	1883370

    # Try for 5% overall cov, and 70% CDS cov
    #	Each featureBits call below is followed by its recorded result;
    #	the comment lines ahead of a call note the phastCons parameters
    #	and model tree in use for that measurement.
    #	--rho .31 --expected-length 45 --target-coverage .3
    #	chrY mod tree
    featureBits mm9 -enrichment refGene:cds phastConsElements30way
    #	refGene:cds 1.167%, phastConsElements30way 4.789%,
    #	both 0.582%, cover 49.90%, enrich 10.42x
    featureBits mm9 -enrichment knownGene:cds phastConsElements30way
    #	knownGene:cds 1.278%, phastConsElements30way 4.789%,
    #	both 0.627%, cover 49.03%, enrich 10.24x
    #	--rho .31 --expected-length 45 --target-coverage .3 elim non-autho
    #	chr13 mod tree
    featureBits mm9 -enrichment refGene:cds mostConserved.bed
    #	refGene:cds 1.167%, mostConserved.bed 4.128%,
    #	both 0.614%, cover 52.59%, enrich 12.74x
    #	--rho .31 --expected-length 45 --target-coverage .3 elim non-autho
    #	28-way mod tree adjusted to 30-way
    featureBits mm9 -enrichment refGene:cds mostConserved.bed
    #	refGene:cds 1.167%, mostConserved.bed 5.841%, both 0.862%, cover
    #	73.90%, enrich 12.65x

    #	the mm8 17-way track, for comparison
    featureBits mm8 -enrichment refGene:cds phastConsElements17way
    #	refGene:cds 1.188%, phastConsElements17way 5.398%,
    #	both 0.832%, cover 70.05%, enrich 12.98x
    featureBits mm8 -enrichment knownGene:cds phastConsElements17way
    #	knownGene:cds 1.109%, phastConsElements17way 5.398%,
    #	both 0.776%, cover 69.99%, enrich 12.97x

    # Create merged posterior probability file and wiggle track data files
    # currently doesn't matter where this is performed, the san is the same
    # network distance from all machines.
    #	gzipAscii.sh: for every pp/chr* directory, concatenate its .pp
    #	files in numeric order of the second dot-separated field of the
    #	path (the window start) and gzip the result to
    #	phastCons30wayScores/<chrom>.data.gz.
    #	The chrom name is taken with the POSIX prefix-strip ${D#pp/} so
    #	that the script really does run under /bin/sh.
    cd /san/sanvol1/scratch/mm9/multiz30way/cons/all
    cat << '_EOF_' > gzipAscii.sh
#!/bin/sh

TOP=`pwd`
export TOP

mkdir -p phastCons30wayScores

for D in pp/chr*
do
    C=${D#pp/}
    out=phastCons30wayScores/${C}.data.gz
    echo "${D} > ${C}.data.gz"
    ls $D/*.pp | sort -n -t\. -k2 | xargs cat | \
	gzip > ${out}
done
'_EOF_'
    #	<< happy emacs
    chmod +x gzipAscii.sh
    time nice -n +19 ./gzipAscii.sh

    # Encode the posterior probability files as wiggle track data files
    # currently doesn't matter where this is performed, the san is the same
    # network distance from all machines.
    cd /san/sanvol1/scratch/mm9/multiz30way/cons/all
    # sort by chromName, chromStart so that items are in numerical order
    #  for wigEncode
    #	(within each pp/chr* directory the .pp files are listed in numeric
    #	order of the second dot-separated field of their path; the whole
    #	list is concatenated into a single wigEncode run)
    for D in pp/chr*
do
    ls $D/*.pp | sort -n -t\. -k2
done | xargs cat \
	| wigEncode -noOverlap stdin phastCons30way.wig phastCons30way.wib
# Converted stdin, upper limit 1.00, lower limit 0.00

    # Load gbdb and database with wiggle.
    #	NOTE(review): the .wig/.wib files were written under
    #	/san/sanvol1/scratch/mm9/multiz30way/cons/all above, while the
    #	commands here run in /cluster/data/.../cons/all -- presumably the
    #	files were copied over first; confirm
    ssh hgwdev
    cd /cluster/data/mm9/bed/multiz30way/cons/all
    ln -s `pwd`/phastCons30way.wib /gbdb/mm9/multiz30way/phastCons30way.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \
	phastCons30way phastCons30way.wig
    #	real    0m42.728s

    #  Create histogram to get an overview of all the data
    #	(stdout and stderr are both captured in histogram.data)
    ssh hgwdev
    cd /cluster/data/mm9/bed/multiz30way
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=mm9 phastCons30way > histogram.data 2>&1
    #	real    28m24.388s

    #	create plot of histogram:
    #	the gnuplot commands are fed on stdin and the PNG is taken from
    #	stdout; columns 2:5 of histogram.data are drawn as impulses
    #	("RelFreq") and columns 2:7 as a line on the second y axis ("CRF")

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm9 Histogram phastCons30way track"
set xlabel " phastCons30way score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

    ### Create a phastCons data set for Euarchontoglires

    # setup euarchontoglires-only run
    ssh pk
    cd /cluster/data/mm9/bed/multiz30way/cons
    mkdir euarchontoglires run.cons/euarchontoglires
    cd euarchontoglires
    # euarchontoglires-only: exclude all but these for phastCons tree:
    /cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \
	--prune-all-but=mm9,rn4,cavPor2,oryCun1,hg18,panTro2,ponAbe2,rheMac2,calJac1,otoGar1,tupBel1 \
	> euarchontoglires.mod
    #	and place the removed ones in the non-inf file so phastCons will
    #	truly ignore them:
    echo "sorAra1,eriEur1,canFam2,felCat3,equCab1,bosTau3,dasNov1,loxAfr1,echTel1,monDom4,ornAna1,galGal3,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat1,danRer5" \
        > euarchontoglires.non-inf

    #	the batch runs from run.cons/euarchontoglires; the directory name
    #	is what doPhast.csh uses as its group
    cd ../run.cons/euarchontoglires
    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create template file for the "euarchontoglires" run
    cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 ../ss.list single template jobList
    para create jobList
    para try ... check ... push ... etc.
    #	Three of these jobs fail to produce any output:
    #	chr5_random/chr5_random.1-357350.bed
    #	chr7_random/chr7_random.1-362490.bed
    #	chrY_random/chrY_random.50000001-58682461.bed
# Completed: 291 of 294 jobs
# Crashed: 3 jobs
# CPU time in finished jobs:      17184s     286.40m     4.77h    0.20d  0.001 y
# IO & Wait Time:                 30139s     502.31m     8.37h    0.35d  0.001 y
# Average job time:                 163s       2.71m     0.05h    0.00d
# Longest finished job:             296s       4.93m     0.08h    0.00d
# Submission to last job:          2775s      46.25m     0.77h    0.03d

    # create Most Conserved track
    ssh kolossus
    cd /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires
    cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
        awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
            /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    #	~ 1 minute
    cp -p mostConserved.bed /cluster/data/mm9/bed/multiz30way/cons/euarchontoglires

    # load into database
    ssh hgwdev
    cd /cluster/data/mm9/bed/multiz30way/cons/euarchontoglires
    time nice -n +19 hgLoadBed mm9 phastConsElements30wayEuarch \
	mostConserved.bed
    #	Loaded 1021674 elements of size 5
    #	real    0m23.402s
    # verify coverage
    featureBits mm9 phastConsElements30wayEuarch
    #	103492546 bases of 2620346127 (3.950%) in intersection

    #	Create the downloads .pp files, from which the phastCons wiggle data
    #	is calculated
    # currently doesn't matter where this is performed, the san is the same
    # network distance from all machines.
    # sort by chromName, chromStart so that items are in numerical order 
    #  for wigEncode
    cd /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires
    mkdir downloads
    for D in pp/chr*
do
    C=${D/pp\//}
    ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \
	> downloads/${C}.euarchontoglires.pp.data.gz
    echo $D $C done
done

    # Create merged posterior probability file and wiggle track data files
    cd /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires
    ls downloads/chr*.data.gz | xargs zcat \
 | wigEncode -noOverlap stdin phastCons30wayEuarch.wig phastCons30wayEuarch.wib
# Converted stdin, upper limit 1.00, lower limit 0.00

    ## load table with wiggle data
    ssh hgwdev
    cd /cluster/data/mm9/bed/multiz30way/cons/euarchontoglires
    cp -p /san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires/*.wi? .
    ln -s `pwd`/phastCons30wayEuarch.wib \
	/gbdb/mm9/multiz30way/phastCons30wayEuarch.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \
	phastCons30wayEuarch phastCons30wayEuarch.wig
    #	real    0m44.161s

    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=mm9 phastCons30wayEuarch > histogram.data 2>&1
    #	real    3m22.364s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color \
        x000000 xffffff xc000ff x66ff66 xffff00 x00ffff xff0000
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm9 Histogram phastCons30wayEuarch track"
set xlabel " phastCons30wayEuarch score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

    ### Create a phastCons data set for Placentals
    # setup placental-only run
    ssh pk
    cd /cluster/data/mm9/bed/multiz30way/cons
    mkdir placental run.cons/placental
    cd placental
    # placental-only: exclude all but these for phastCons tree:
    /cluster/bin/phast.new/tree_doctor ../../mm9.30way.mod \
	--prune-all-but=mm9,rn4,cavPor2,oryCun1,hg18,panTro2,ponAbe2,rheMac2,calJac1,otoGar1,tupBel1,sorAra1,eriEur1,canFam2,felCat3,equCab1,bosTau3,dasNov1,loxAfr1,echTel1 \
	> placental.mod
    #	and place the removed ones in the non-inf file so phastCons will
    #	truly ignore them:
    echo "monDom4,ornAna1,galGal3,anoCar1,xenTro2,tetNig1,fr2,gasAcu1,oryLat1,danRer5" \
        > placental.non-inf

    cd ../run.cons/placental
    #	root1 == chrom name, file1 == ss file name without .ss suffix
    # Create template file for the placental run
    cat << '_EOF_' > template
#LOOP
../doPhast.csh $(root1) $(file1) 45 .3 .31 {check out line+ /san/sanvol1/scratch/mm9/multiz30way/cons/placental/bed/$(root1)/$(file1).bed}
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 ../ss.list single template jobList
    para create jobList
    para try ... check ... push ... etc.
    #	One of these jobs fails to produce any output:
    #	chr5_random/chr5_random.1-357350.bed
# Completed: 293 of 294 jobs
# Crashed: 1 jobs
# CPU time in finished jobs:      21121s     352.01m     5.87h    0.24d  0.001 y
# IO & Wait Time:                 33985s     566.42m     9.44h    0.39d  0.001 y
# Average job time:                 188s       3.13m     0.05h    0.00d
# Longest finished job:             324s       5.40m     0.09h    0.00d
# Submission to last job:          3511s      58.52m     0.98h    0.04d

    # create Most Conserved track
    ssh kolossus
    cd /san/sanvol1/scratch/mm9/multiz30way/cons/placental
    cat bed/*/chr*.bed | sort -k1,1 -k2,2n | \
        awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' | \
            /cluster/bin/scripts/lodToBedScore /dev/stdin > mostConserved.bed
    #	~ 1 minute
    cp -p mostConserved.bed /cluster/data/mm9/bed/multiz30way/cons/placental

    # load into database
    ssh hgwdev
    cd /cluster/data/mm9/bed/multiz30way/cons/placental
    time nice -n +19 hgLoadBed mm9 phastConsElements30wayPlacental \
	mostConserved.bed
    #	Loaded 1990870 elements of size 5
    #	real    0m48.084s
    # verify coverage
    featureBits mm9 phastConsElements30wayPlacental
    #	111626429 bases of 2620346127 (4.260%) in intersection

    #	Create the downloads .pp files, from which the phastCons wiggle data
    #	is calculated
    # currently doesn't matter where this is performed, the san is the same
    # network distance from all machines.
    # sort by chromName, chromStart so that items are in numerical order 
    #  for wigEncode
    cd /san/sanvol1/scratch/mm9/multiz30way/cons/placental
    mkdir downloads
    for D in pp/chr*
do
    C=${D/pp\//}
    ls $D/*.pp | sort -n -t\. -k2 | xargs cat | gzip -c \
	> downloads/${C}.placental.pp.data.gz
    echo $D $C done
done

    # Create merged posterior probability file and wiggle track data files
    cd /san/sanvol1/scratch/mm9/multiz30way/cons/placental
    ls downloads/chr*.data.gz | xargs zcat \
 | wigEncode -noOverlap stdin phastCons30wayPlacental.wig \
	phastCons30wayPlacental.wib
# Converted stdin, upper limit 1.00, lower limit 0.00

    ## load table with wiggle data
    ssh hgwdev
    cd /cluster/data/mm9/bed/multiz30way/cons/placental
    cp -p /san/sanvol1/scratch/mm9/multiz30way/cons/placental/*.wi? .
    ln -s `pwd`/phastCons30wayPlacental.wib \
	/gbdb/mm9/multiz30way/phastCons30wayPlacental.wib
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \
	phastCons30wayPlacental phastCons30wayPlacental.wig
    #	real    0m44.585s

    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.001 -hBinCount=1000 -hMinVal=0.0 -verbose=2 \
	    -db=mm9 phastCons30wayPlacental > histogram.data 2>&1
    #	real    28m24.388s

    #	create plot of histogram:

    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm9 Histogram phastCons30wayPlacental track"
set xlabel " phastCons30wayPlacental score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.02]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#############################################################################
## Downloads for 30way Conservation (DONE - 2007-11-01 - Hiram)
    ssh kkstore06
    mkdir /cluster/data/mm9/bed/multiz30way/downloads/phastCons30wayScores
    cd /cluster/data/mm9/bed/multiz30way/downloads/phastCons30wayScores
    mkdir placental euarchontoglires all
    cd all
    cp -p \
/san/sanvol1/scratch/mm9/multiz30way/cons/all/phastCons30wayScores/*.data.gz .
    cd ../placental
    cp -p \
/san/sanvol1/scratch/mm9/multiz30way/cons/placental/downloads/*.data.gz .
    cd ../euarchontoglires
    cp -p \
/san/sanvol1/scratch/mm9/multiz30way/cons/euarchontoglires/downloads/*.data.gz .

    #	rebuilt 2007-12-27 to fix difficulty in mafFrags when species.lst
    #	did not have mm9 as the first one
    # upstream mafs (mafFrags takes a while)
    ssh kkstore06
    cd /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf
    # bash script
#!/bin/sh
for S in 1000 2000 5000
do
    echo "making upstream${S}.maf"
    featureBits mm9 refGene:upstream:${S} -fa=/dev/null -bed=stdout \
        | perl -wpe 's/_up[^\t]+/\t0/' | sort -k1,1 -k2,2n \
        | $HOME/kent/src/hg/ratStuff/mafFrags/mafFrags mm9 multiz30way \
                stdin stdout \
                -orgs=/cluster/data/mm9/bed/multiz30way/species.list \
        | gzip -c > upstream${S}.maf.gz
    echo "done upstream${S}.maf.gz"
done

    md5sum up*.gz >> md5sum.txt

    # Package the quality-annotated and the plain annotated mafs for
    # download.  The relative source paths (../../../qual/maf and
    # ../../../anno/maf) and the "." destination only make sense from
    # inside the freshly created directory, so cd into each one right
    # after creating it; otherwise the copies, gzip and md5sum.txt land
    # in whatever directory the shell happened to be in.
    ssh kkstore06
    mkdir /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf_qual
    cd /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf_qual
    cp -p ../../../qual/maf/*.maf .
    time nice -n +19 gzip *.maf
    #	real    77m3.592s
    time nice -n +19 md5sum *.gz > md5sum.txt
    #	real    4m52.044s

    mkdir /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf
    cd /cluster/data/mm9/bed/multiz30way/downloads/multiz30way/maf
    cp -p ../../../anno/maf/*.maf .
    time nice -n +19 gzip *.maf
    #	real    86m2.341s
    time nice -n +19 md5sum *.gz > md5sum.txt
    #	real    4m30.087s

    #	create syn.net files for downloads for those organisms which
    #	used the mafSynNet in the multiz30way
    ssh kkstore06
    cd /cluster/data/mm9/bed
    for DB in rn4 hg18 rheMac2 ponAbe2 panTro2 equCab1 canFam2 bosTau3
    do
	cd /cluster/data/mm9/bed/blastz.${DB}/axtChain
	time nice -n +19 netFilter -syn mm9.${DB}.net.gz \
	    | gzip -c > mm9.${DB}.syn.net.gz
	ls -og mm9.${DB}.syn.net.gz
	md5sum mm9.${DB}.syn.net.gz >> md5sum.txt
    done
    for DB in calJac1 cavPor2 tupBel1 otoGar1 dasNov1 oryCun1 felCat3 \
	loxAfr1 eriEur1 sorAra1 echTel1
    do
	cd /cluster/data/mm9/bed/blastz.${DB}/axtChain
	ls -l mm9.${DB}.rbest.net.gz
	md5sum mm9.${DB}.rbest.net.gz >> md5sum.txt
	md5sum mm9.${DB}.rbest.chain.gz >> md5sum.txt
	grep rbest md5sum.txt
    done

    #	create symlinks to make everything show up
    #	Each loop lists the link target, creates the symlink inside the
    #	matching vs* download directory, then lists the result through
    #	the link (ls -L) as a check.
    #	The DB names are written with "?" in place of their first letter
    #	and ${DB} is left unquoted, so the shell glob-expands it separately
    #	in "blastz.${DB}" and in "vs${DB}/".
    #	NOTE(review): presumably this lets one pattern match both the
    #	lower-case db name under /cluster/data and the capitalized
    #	vs<Db> directory name here (e.g. vsRn4) -- confirm.  It relies on
    #	the bare patterns in the for list matching nothing in this
    #	directory so that they stay literal.
    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/mm9
    #	syntenic nets
    for DB in ?n4 ?g18 ?heMac2 ?onAbe2 ?anTro2 ?quCab1 ?anFam2 ?osTau3
do
    ls -Lld /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.syn.net.gz
    ln -s /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.syn.net.gz \
	vs${DB}/
    ls -Lld vs${DB}/mm9.*.syn.net.gz
done
    #	reciprocal-best nets
    for DB in ?alJac1 ?avPor2 ?upBel1 ?toGar1 ?asNov1 ?ryCun1 ?elCat3 \
	?oxAfr1 ?riEur1 ?orAra1 ?chTel1
do
    ls -Lld /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.net.gz
    ln -s /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.net.gz \
	vs${DB}/
    ls -Lld vs${DB}/mm9.${DB}.rbest.net.gz
    grep rbest vs${DB}/md5sum.txt
done
    #	Same organism list again: the rbest.net commands below repeat the
    #	loop above verbatim, and the rbest.chain link is the new part.
    #	NOTE(review): if the previous loop was run first, the rbest.net
    #	"ln -s" here will report that the link already exists.
    for DB in ?alJac1 ?avPor2 ?upBel1 ?toGar1 ?asNov1 ?ryCun1 ?elCat3 \
	?oxAfr1 ?riEur1 ?orAra1 ?chTel1
do
    ls -Lld /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.net.gz
    ln -s /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.net.gz \
	vs${DB}/
    ls -Lld vs${DB}/mm9.${DB}.rbest.net.gz
    ls -Lld /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.chain.gz
    ln -s /cluster/data/mm9/bed/blastz.${DB}/axtChain/mm9.${DB}.rbest.chain.gz \
	vs${DB}/
    ls -Lld vs${DB}/mm9.${DB}.rbest.chain.gz
    grep rbest vs${DB}/md5sum.txt
done

###########################################################################
#
#  BUILD miRNA TRACK (DONE - 2007-10-05 - Fan)
    #   updated data from: Michel.Weber@ibcg.biotoul.fr
    #   notify them when done.
    ssh hgwdev
    cd /cluster/data/mm9/bed
    mkdir miRNA-2007-10-05
    cd miRNA-2007-10-05
    # save the miRNAtrack-mm9.txt file from email

    cat miRNAtrack-mm9.txt|sed -e 's/ /\t/g' > miRNA.tab

    hgLoadBed mm9 miRNA miRNA.tab

# Add the miRNA section to makeDb/trackDb/mouse/mm9/trackDb.ra
    vi ~/src/hg/makeDb/trackDb/mouse/mm9/trackDb.ra

# check previous release track before update
    featureBits mm8 miRNA
    #33398 bases of 2567283971 (0.001%) in intersection

    featureBits mm9 miRNA
    #39718 bases of 2620346127 (0.002%) in intersection

###########################################################################
#  RE-BUILD miRNA TRACK (DONE  2008-05-29 - Fan)
    #   updated data from: Michel.Weber@ibcg.biotoul.fr
    #   notify them when done.
    ssh hgwdev
    cd /cluster/data/mm9/bed
    mkdir miRNA-2008-05-28
    cd miRNA-2008-05-28
    # save the mouse_miRNA_may2008.doc as mouse_miRNA_may2008.txt
    # and replace all blanks with tabs.

    cp mouse_miRNA_may2008.txt miRNA.tab
    hgLoadBed mm9 miRNA miRNA.tab

# check previous release track before update
    featureBits mm8 miRNA
    #33398 bases of 2567283971 (0.001%) in intersection

    featureBits mm9 miRNA
    #43236 bases of 2620346127 (0.002%) in intersection

#############################################################################
# N-SCAN gene predictions (nscanGene) - (2006-08-30 markd)

    # obtained NSCAN predictions from michael brent's group
    # at WUSTL
    cd /cluster/data/mm9/bed/nscan/
    wget http://mblab.wustl.edu/predictions/mouse/mm9/mm9.gtf
    wget http://mblab.wustl.edu/predictions/mouse/mm9/mm9.prot.fa
    wget http://mblab.wustl.edu/predictions/mouse/mm9/readme.txt
    bzip2 mm9.*
    chmod a-w *

    mv ardor.wustl.edu/jeltje/mm9/chr_ptx .
    rm -rf ardor.wustl.edu
    rm chr_*/index.html*
    gzip chr_*/*
    chmod a-w chr_*/*.gz

    # load track
    ldHgGene -bin -gtf -genePredExt mm9 nscanGene mm9.gtf.bz2
    hgPepPred mm9 generic nscanPep  mm9.prot.fa.bz2
    rm *.tab

    # update trackDb; need a mm9-specific page to describe informants
    mouse/mm9/nscanGene.html   (copy from hg18 and edit)
    mouse/mm9/trackDb.ra
    # changed search regex to
        termRegex chr[0-9a-zA-Z_].*\.[0-9]+\.[0-9]

#########################################################################
# CPGISLANDS (DONE - 2007-10-25 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/mm9/bed/cpgIsland
    cd /cluster/data/mm9/bed/cpgIsland

    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    #	gcc readseq.c cpg_lh.c -o cpglh.exe
    cd ../..
    ln -s hg3rdParty/cpgIslands/cpglh.exe .
    
    # cpglh.exe requires hard-masked (N) .fa's.
    #	make the hard masked sequences from these soft masked sequences
    #	The globs below are relative (<chrom dir>/chr*.fa), so move to the
    #	assembly directory first; the cpglh loop in the next step reads the
    #	results back as ../../*/chr*.fa.masked from bed/cpgIsland, i.e.
    #	from /cluster/data/mm9.  Each output is written next to its input
    #	as <input>.masked.
    ssh kkstore06
    cd /cluster/data/mm9
    time for CHR in ?/chr?.fa ??/chr??.fa ?/chr?_random.fa ??/chr??_random.fa
    do
	echo "maskOutFa ${CHR} hard ${CHR}.masked"
	nice -n +19 maskOutFa "${CHR}" hard "${CHR}.masked"
    done
    #	about 2 minutes

    # There may be warnings about "bad character" for IUPAC ambiguous 
    # characters like R, S, etc.  Ignore the warnings.  
    cd /cluster/data/mm9/bed/cpgIsland
    time for F in ../../*/chr*.fa.masked
    do
	FA=${F/*\/}
	C=${FA/.fa.masked/}
	echo "./cpglh.exe ${FA} > ${C}.cpg"
	nice -n +19 ./cpglh.exe ${F} > ${C}.cpg
    done > cpglh.out 2>&1 &
    #	about 3 minutes

    #	Several chroms have 0 results:
    #	-rw-rw-r--  1     0 Oct 25 11:11 chr16_random.cpg
    #	-rw-rw-r--  1     0 Oct 25 11:12 chr3_random.cpg
    #	-rw-rw-r--  1     0 Oct 25 11:12 chr5_random.cpg
    #	-rw-rw-r--  1     0 Oct 25 11:13 chr7_random.cpg
    #	-rw-rw-r--  1     0 Oct 25 11:13 chrM.cpg
    #	-rw-rw-r--  1     0 Oct 25 11:13 chrX_random.cpg
    #	-rw-rw-r--  1     0 Oct 25 11:13 chrY.cpg

    # Transform cpglh output to bed +
    #	The here-document below is written verbatim to filter.awk (the
    #	quoted delimiter keeps the shell from expanding $1, $2, ...).
    #	For every input line the awk program:
    #	  - subtracts 1 from field 2 (presumably converting a 1-based
    #	    start to a 0-based BED start -- confirm against cpglh output)
    #	  - computes width as field 3 minus the adjusted field 2
    #	  - prints 10 tab-separated columns: $1, $2, $3, "$5 $6" (joined
    #	    by a single space), width, $6, width*$7*0.01 rounded to an
    #	    integer, 100*2*$6/width to one decimal place, $7, $9
    #	The 10 columns agree with "elements of size 10" reported by
    #	hgLoadBed further down.
    cat << '_EOF_' > filter.awk
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    #	<< happy emacs
    #	run the filter over every per-chromosome .cpg file and sort by
    #	chrom name, then numerically by start
    awk -f filter.awk chr*.cpg | sort -k1,1 -k2,2n > cpgIsland.bed

    ssh hgwdev
    cd /cluster/data/mm9/bed/cpgIsland
    hgLoadBed mm9 cpgIslandExt -tab \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed
    #	Reading cpgIsland.bed
    #	Loaded 15963 elements of size 10
    featureBits mm9 cpgIslandExt
    #	10496250 bases of 2620346127 (0.401%) in intersection
    featureBits mm8 cpgIslandExt
    #	10456823 bases of 2567283971 (0.407%) in intersection
    featureBits mm7 cpgIslandExt
    #	10439328 bases of 2583394090 (0.404%) in intersection
    featureBits mm6 cpgIslandExt
    #	10432360 bases of 2597150411 (0.402%) in intersection
    featureBits mm5 cpgIslandExt
    #	10422989 bases of 2615483787 (0.399%) in intersection
    featureBits mm4 cpgIsland
    #	11109692 bases of 2627444668 (0.423%) in intersection
    featureBits mm3 cpgIsland
    #	10102968 bases of 2505900260 (0.403%) in intersection

#############################################################################
# LIFTOVER (DROPUNDER) TO MM8 (DONE - 2007-11-05 - Hiram)
    ssh kkstore06
    screen	# use a screen to control this job
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -debug mm9 mm8 \
      -ooc /san/sanvol1/scratch/mm9/11.ooc
    # Real run:
    cd /cluster/data/mm9/bed/blat.mm8.2007-11-05
    time nice -n +19 doSameSpeciesLiftOver.pl mm9 mm8 \
      -ooc /san/sanvol1/scratch/mm9/11.ooc > do.log 2>&1 &

########################################################################
# ANNOTATE 30-WAY ALIGNMENT WITH QUALITY DATA (2007-11-07 rico at bx.psu.edu)
#
# The basic idea here is to create a qac file which has quality data for each
# (chromosome/scaffold/etc) and then index the qac file.  Once this is done,
# mafAddQRows can be used to add the quality data to a given maf.  The agp
# files are used so that gaps can be represented in the qac files as a special
# value.

    ## create .qac and .qdx files for each species in the 30-way alignment
    ## results are stored in /cluster/store12/rico/quality
    o human (hg18)
        Unable to find quality data.

    o chimp (panTro2)
        in.agp = cat /cluster/data/panTro2/wustl/*.agp > all.agp
        in.qac = /cluster/data/panTro2/bed/quality/qac/panTro2.qac
        qacAddGapIdx in.agp in.qac panTro2.qac panTro2.qdx

    o rhesus (rheMac2)
        in.agp: /cluster/data/rheMac2/downloads
            (cat v1.edit4.chrome.ctgs.final.fix.agp; sed -e 's/^ChrUr/chrUr/' v1.edit4.ChrUr.ctgs.agp ) > all.agp
        in.qa = /cluster/data/rheMac2/qual/rheMac2.qual.qv.gz
        qaAgpToQacIdx in.agp in.qa rheMac2.qac rheMac2.qdx

    o bushbaby (otoGar1)
        http://www.broad.mit.edu/ftp/pub/assemblies/mammals/bushbaby/otoGar1
        in.agp = assembly.agp
        in.qa = Draft_v1.agp.chromosome.qual.gz
        qaAgpToQacIdx in.agp in.qa otoGar1.qac otoGar1.qdx

    o treeshrew (tupBel1)
        http://www.broad.mit.edu/ftp/pub/assemblies/mammals/treeShrew/tupBel1
        in.agp = assembly.agp
        in.qa = Draft_v1.agp.chromosome.qual.gz
        qaAgpToQacIdx in.agp in.qa tupBel1.qac tupBel1.qdx

    o rat (rn4)
        in.agp: /cluster/data/rn4
            #!/bin/sh

            rm -f rn4.agp

            for chrom in `awk '{print $1}' chrom.sizes`
            do
                num=`echo $chrom | cut -dr -f2- | cut -d_ -f1`
                if [ -f "$num/${chrom}.agp" ]; then
                    cat $num/${chrom}.agp >> rn4.agp
                else
                    echo "Missing agp file for $chrom"
                    exit 1
                fi
            done
        in.qa: /cluster/data/rn4/downloads
            #!/bin/sh
            # Build rn4.qa from the per-chromosome *.qual.gz files in the
            # current directory.  For each file, derive the sequence name
            # from the file name (strip the leading "Rnor3.4" and the
            # trailing ".fa.qual.gz", turn '-' into '_'), write it as a
            # new ">name" header, and append the file's contents minus
            # its own first line.

            rm -f rn4.qa

            for file in *.qual.gz
            do
                # printf rather than "echo -n": -n is not portable under /bin/sh
                printf 'Processing %s ... ' "$file"
                chrom=`echo "$file" | sed -e 's/^Rnor3.4//;s/\.fa\.qual\.gz$//' | tr '-' '_'`
                # "tail -n +2" (start at line 2) replaces the obsolete
                # "tail +2", which current GNU tail treats as a file name
                (echo ">$chrom" ; gzip -dc "$file" | tail -n +2) >> rn4.qa
                echo "done."
            done
        qaAgpToQacIdx in.agp in.qa rn4.qac rn4.qdx

    o mouse (mm9)
        Unable to find quality data.

    o guinea pig (cavPor2)
        in.agp = /cluster/data/cavPor2/downloads/assembly.agp
        in.qa = /cluster/data/cavPor2/downloads/Draft_v2.agp.chromosome.qual.gz
        qaAgpToQacIdx in.agp in.qa cavPor2.qac cavPor2.qdx

    o rabbit (oryCun1)
        http://www.broad.mit.edu/ftp/pub/assemblies/mammals/rabbit/oryCun1
        in.agp = assembly.agp
        in.qa = Draft_v1.agp.chromosome.qual.gz
        qaAgpToQacIdx in.agp in.qa oryCun1.qac oryCun1.qdx

    o shrew (sorAra1)
        in.agp = /cluster/data/sorAra1/downloads/assembly.agp
        in.qa = /cluster/data/sorAra1/downloads/Draft_v1.agp.chromosome.qual.gz
        qaAgpToQacIdx in.agp in.qa sorAra1.qac sorAra1.qdx

    o hedgehog (eriEur1)
        in.agp = /cluster/data/eriEur1/downloads/assembly.agp
        in.qa = /cluster/data/eriEur1/downloads/Draft_v1.agp.chromosome.qual.gz
        qaAgpToQacIdx in.agp in.qa eriEur1.qac eriEur1.qdx

    o dog (canFam2)
        in.agp = /cluster/store9/canFam2/broad/UCSC_Dog2.0.agp
        in.qac = /cluster/store9/canFam2/bed/quality/chrom.qac
        qacAddGapIdx in.agp in.qac canFam2.qac canFam2.qdx

    o cat (felCat3)
        in.agp = /cluster/data/felCat3/downloads/assembly.agp
        in.qa = /cluster/data/felCat3/downloads/Draft_v3.agp.chromosome.qual.gz
        qaAgpToQacIdx in.agp in.qa felCat3.qac felCat3.qdx

    o horse (equCab1)
        in.agp = /cluster/data/equCab1/downloads/assembly.agp
        in.qa = /cluster/data/equCab1/downloads/Draft_v1.agp.chromosome.qual.gz
        qaAgpToQacIdx in.agp in.qa equCab1.qac equCab1.qdx

    o cow (bosTau3)
        in.agp = /cluster/data/bosTau3/fixup/UCSC.agp
        in.qac = /cluster/data/bosTau3/fixup/chrom.qac
        qacAddGapIdx in.agp in.qac bosTau3.qac bosTau3.qdx

    o armadillo (dasNov1)
        /cluster/data/dasNov1/broad
            combineQuals assembly.agp.gz assembly.quals.gz combined.quals
            qaAgpToQacIdx assembly.agp.gz combined.quals dasNov1.qac dasNov1.qdx

    o elephant (loxAfr1)
        /cluster/data/loxAfr1/broad
            combineQuals assembly.agp assembly.quals.gz combined.quals
            qaAgpToQacIdx assembly.agp combined.quals loxAfr1.qac loxAfr1.qdx

    o tenrec (echTel1)
        /cluster/data/echTel1/broad
            combineQuals assembly.agp assembly.quals.gz combined.quals
            qaAgpToQacIdx assembly.agp combined.quals echTel1.qac echTel1.qdx

    o opossum (monDom4)
        /cluster/data/monDom4/broad.mit.edu
        in.qa = gzip -dc Monodelphis4.0.agp.chromosome.qual.gz \
            | sed -e 's/^>\([^.]*\)\.1-.*/>chr\1/;/^>.*Monodelphis4.0)/d' > monDom4.qa
        in.agp = Monodelphis4.0.agp
        qaAgpToQacIdx in.agp in.qa monDom4.qac monDom4.qdx

    o platypus (ornAna1)
        Unable to find quality data.

    o chicken (galGal3)
        Unable to find quality data.

    o lizard (anoCar1)
        in.agp = /cluster/data/anoCar1/downloads/assembly.agp
        in.qac = /cluster/data/anoCar1/downloads/scaffold.lifted.qac
        qacAddGapIdx in.agp in.qac anoCar1.qac anoCar1.qdx

    o frog (xenTro2)
        Unable to find quality data.

    o tetraodon (tetNig1)
        Unable to find quality data.

    o fugu (fr2)
        Unable to find quality data.

    o stickleback (gasAcu1)
        in.agp = /cluster/data/gasAcu1/downloads/UCSC.gasAcu1.agp
        in.qa = /cluster/data/gasAcu1/downloads/UCSC.gasAcu1.qual
        qaAgpToQacIdx in.agp in.qa gasAcu1.qac gasAcu1.qdx

    o medaka (oryLat1)
        in.agp = /cluster/data/oryLat1/downloads/chr.agp.txt-fixed
        in.qac = /cluster/data/oryLat1/bed/qual/fixed.chroms.qac
        qacAddGapIdx in.agp in.qac oryLat1.qac oryLat1.qdx

    o zebrafish (danRer5)
        Unable to find quality data.

    o orangutan (ponAbe2)
        Unable to find quality data.

    o marmoset (calJac1)
        Unable to find quality data.

    ## copy all .qac and .qdx files to the san
    cp *.{qac,qdx} /san/sanvol1/rico/quality

    ## create species list (species.lst) containing the following
    anoCar1 /san/sanvol1/rico/quality
    bosTau3 /san/sanvol1/rico/quality
    canFam2 /san/sanvol1/rico/quality
    cavPor2 /san/sanvol1/rico/quality
    dasNov1 /san/sanvol1/rico/quality
    echTel1 /san/sanvol1/rico/quality
    equCab1 /san/sanvol1/rico/quality
    eriEur1 /san/sanvol1/rico/quality
    felCat3 /san/sanvol1/rico/quality
    gasAcu1 /san/sanvol1/rico/quality
    loxAfr1 /san/sanvol1/rico/quality
    monDom4 /san/sanvol1/rico/quality
    oryCun1 /san/sanvol1/rico/quality
    oryLat1 /san/sanvol1/rico/quality
    otoGar1 /san/sanvol1/rico/quality
    panTro2 /san/sanvol1/rico/quality
    rheMac2 /san/sanvol1/rico/quality
    rn4     /san/sanvol1/rico/quality
    sorAra1 /san/sanvol1/rico/quality
    tupBel1 /san/sanvol1/rico/quality

    ## the following script will add quality data to each of the mafs
    ## - the here-document is saved verbatim as the file addQData
    ## - mafs are taken from INPUT_DIR in the order given by "ls -1Sr":
    ##   sorted by size with the order reversed, i.e. smallest file first
    ## - every maf is passed to mafAddQRows together with species.lst and
    ##   the result is written under the same base name in OUTPUT_DIR
    ## - species.lst is referenced by a relative path, so the script has
    ##   to be run from the directory that contains it
    ## NOTE(review): the terminator line is written with its quotes
    ## ('EOF'), as with the other here-documents in this file; a POSIX sh
    ## here-document ends at a bare EOF line, so confirm which shell this
    ## was pasted into before re-running it.
cat > addQData << 'EOF'
#!/bin/sh

INPUT_DIR=/cluster/data/mm9/bed/multiz30way/anno/maf
OUTPUT_DIR=/cluster/data/mm9/bed/multiz30way/qual/maf

for maf in `ls -1Sr ${INPUT_DIR}/*.maf`
do
    file=`basename $maf`

    mafAddQRows species.lst $maf ${OUTPUT_DIR}/$file
done
'EOF'

# << emacs
#########################################################################
### IGTC (Int'l GeneTrap Consortium) (DONE - 2007-10-01 - angie)
### Doug Stryke <stryke@cgl.ucsf.edu> in Tom Ferrin's lab

### NOTE -- the igtc track is automatically updated on hgwdev by the
### scripts monthlyUpdateIgtc.csh and updateIgtc.pl in
### kent/src/hg/utils/automation/ .

#########################################################################
# Load CCDS (2007-12-12 markd)
    # import ccds database as described in ccds.txt
    set db=mm9
    # create and load ccdsGene and ccdsInfo tables from imported database
    /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene

    # ccdsKgMap
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap

    # build initial version of ccdsMgcMap table, updated by nightly genbank update
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene mgcGenes ccdsMgcMap

    checkTableCoords ${db} -verbose=2 ccdsGene
    joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner

############################################################################
# Reload CCDS (2008-02-01 markd)
    # import ccds database as described in ccds.txt
    set db=mm9
    # create and load ccdsGene and ccdsInfo tables from imported database
    /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ccdsInfo ccdsGene

    # ccdsKgMap
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap

    checkTableCoords ${db} -verbose=2 ccdsGene
    # update all.joiner to include ${db} in ccdsDb
    joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
    # request push of 
        ccdsGene
        ccdsInfo
        ccdsKgMap
    # << emacs


############################################################################
# dbSNP BUILD 128 (DONE 2/8/08 angie)
# updated snp128ExceptionDesc (tweaked wording) 3/11/08
    # Set up build directory
    ssh kkstore06
    mkdir -p /cluster/store3/dbSNP128/{mouse,shared}

    # dbSNP 128 field encodings (*.bcp.gz) were already downloaded -- 
    # see hg18.txt.  

    ########################## DOWNLOAD #############################
    cd /cluster/data/dbSNP/128/mouse
    mkdir data schema rs_fasta
    # Get data from NCBI (anonymous FTP)
    wget ftp://ftp.ncbi.nih.gov/snp/00readme.txt
    cd /cluster/data/dbSNP/128/mouse/data
    alias wg wget --timestamping
    set ftpSnpDb = ftp://ftp.ncbi.nih.gov/snp/organisms/mouse_10090/database
    # ContigLoc table has coords, orientation, loc_type, and refNCBI allele
    wg $ftpSnpDb/organism_data/b128_SNPContigLoc_37_1.bcp.gz
    wg $ftpSnpDb/organism_data/b128_SNPContigLocusId_37_1.bcp.gz
    wg $ftpSnpDb/organism_data/b128_ContigInfo_37_1.bcp.gz
    # MapInfo has alignment weights
    wg $ftpSnpDb/organism_data/b128_SNPMapInfo_37_1.bcp.gz
    # SNP has univar_id, validation status and heterozygosity
    wg $ftpSnpDb/organism_data/SNP.bcp.gz

    # Get schema
    cd /cluster/data/dbSNP/128/mouse/schema
    wg $ftpSnpDb/organism_schema/mouse_10090_table.sql.gz

    # Get fasta files
    # using headers of fasta files for molType, class, observed
    cd /cluster/data/dbSNP/128/mouse/rs_fasta
    wg ftp://ftp.ncbi.nih.gov/snp/organisms/mouse_10090/rs_fasta/\*.gz


    ########################## LOAD NCBI TABLES #############################
    # Simplify names of data files -- strip version & extras to get
    # local canonical table names.
    cd /cluster/data/dbSNP/128/mouse/data
    foreach f (*.bcp.gz)
      set new = `echo $f \
                 | sed -e 's/^b128_SNP//; s/^b128_//; s/_37_1//; s/.bcp//;'`
      mv $f $new
      echo $new
    end

    # Extract just the tables that we need from the NCBI MS SQL table
    # creation file, and get CREATE statements from
    # mouse_10090_table.sql for our 5 tables
    #
    # How the perl filter works:
    #  - the input record separator is set to "\nGO\n\n\n", so each
    #    record read by <> is one statement of the dump
    #  - only records starting with CREATE TABLE for ContigInfo,
    #    ContigLoc, ContigLocusId, MapInfo or SNP (with optional
    #    b128_ / b128_SNP prefix and _37_1 suffix) are kept
    #  - the prefix and suffix are stripped, square brackets are
    #    removed, the trailing GO separator becomes ";", and several
    #    type names and clauses are rewritten or dropped
    #    (smalldatetime, nvarchar, image, ON PRIMARY, COLLATE, ...)
    # NOTE(review): in the IDENTITY substitution the parentheses in
    # "(1, 1)" are not escaped, so as a regex they form a capture group
    # and the pattern matches "IDENTITY 1, 1 NOT NULL " rather than the
    # parenthesized text; compare the escaped \( \) in the varchar
    # substitution two lines below it.  Left as recorded -- confirm
    # whether any of the five tables has an IDENTITY column before
    # reusing this as a template.
    cd /cluster/data/dbSNP/128/mouse/schema

    zcat mouse_10090_table.sql.gz \
    | perl -we '$/ = "\nGO\n\n\n"; \
        while (<>) { \
          next unless /^CREATE TABLE \[(b128_(SNP)?)?(ContigInfo|ContigLoc|ContigLocusId|MapInfo|SNP)(_37_1)?\]/; \
          s/b128_(SNP)?//; s/_37_1//; \
          s/[\[\]]//g;  s/GO\n\n\n/;/;  s/smalldatetime/datetime/g; \
          s/ON PRIMARY//g;  s/COLLATE//g;  s/Latin1_General_BIN//g; \
          s/IDENTITY (1, 1) NOT NULL /NOT NULL AUTO_INCREMENT, PRIMARY KEY (id)/g; \
          s/nvarchar/varchar/g;  s/set quoted/--set quoted/g; \
          s/(image|varchar\s+\(\d+\))/BLOB/g; \
          print; \
        }' \
      > table.sql

    # load on kolossus or a small cluster machine (mysql5 is OK for this).
    ssh kkr3u00
    hgsql '' -e 'create database mm9snp128' 
    cd /cluster/data/dbSNP/128/mouse/schema
    hgsql mm9snp128 < table.sql
    cd ../data

    foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP)
      zcat $t.gz \
      | perl -wpe 's/(\d\d:\d\d:\d\d)\.0/$1/g;' \
      | hgLoadSqlTab -oldTable mm9snp128 $t placeholder stdin
    end
    # There were some warnings (many cleared up by the perl substitution)
    # but no rows were dropped.  I eyeballed a few examples, seemed OK,
    # e.g. no value given for a field where NULL is OK.
    foreach t (ContigInfo ContigLoc ContigLocusId MapInfo SNP) 
     echo -n "${t}:\t"
      hgsql -N -B mm9snp128 -e 'select count(*) from '$t
    end
#ContigInfo:     13636
#ContigLoc:      31733892
#ContigLocusId:  12883378
#MapInfo:        28464204
#SNP:            14380527

    # compare contig list in mm9.ctgPos vs ContigInfo (for the reference
    # strain, not the alts included in ContigInfo)
    # Dump ctgPos from hgwdev and strip the version suffix from contig
    # accessions (e.g. NT_123456.7 -> NT_123456).
    # The whole remote command is passed as one quoted word: ssh joins its
    # arguments and hands them to the remote shell, which would otherwise
    # glob the bare * and treat the ; as a command separator.
    ssh hgwdev "hgsql mm9 -NBe 'select * from ctgPos;'" \
      | sed -re 's/^(N[A-Z]_[0-9]+)\.[0-9]+/\1/;' \
      > ctgPos.tab
    awk '{print $1;}' ctgPos.tab | sort > /tmp/1

    # Take a look at the group_label values and choose a set that matches
    # the reference assembly:
    hgsql mm9snp128 -NBe 'select distinct(group_label) from ContigInfo'
    # Looks like just C57BL/6J (the reference strain) will do.

    hgsql mm9snp128 -NBe 'select contig_acc from ContigInfo \
                          where group_label = "C57BL/6J"' \
    | sort > /tmp/2
    diff /tmp/1 /tmp/2 
    # No diff, good.
    # Make sure there are no orient != 0 contigs among those selected.
    hgsql mm9snp128 -NBe \
      'select count(*) from ContigInfo where orient != 0 and \
         group_label = "C57BL/6J";'
#0

    #################### EXTRACT INFO FROM NCBI TABLES ####################
    mkdir -p /scratch/snp/128/mouse
    cd /scratch/snp/128/mouse

    time hgsql mm9snp128 -e \
      'alter table ContigLoc  add index (ctg_id); \
       alter table ContigInfo add index (ctg_id);'
#0.002u 0.001s 6:18.71 0.0%      0+0k 0+0io 1pf+0w

    time hgsql mm9snp128 -e \
      'alter table ContigInfo add index (group_label(9));'
#0.002u 0.002s 0:00.35 0.0%      0+0k 0+0io 1pf+0w

    # Since there is only one group_label for mouse, just use snp_id
    # as key.  If there is more than one group_label to pick up, then
    # don't use this as a template -- use hg18.txt.
    hgsql mm9snp128 -NBe \
      'select snp_id, ContigInfo.contig_acc, asn_from, asn_to, \
              loc_type, orientation, allele, phys_pos_from \
       from ContigLoc, ContigInfo \
       where ContigLoc.ctg_id = ContigInfo.ctg_id and  \
             ContigInfo.group_label = "C57BL/6J";' \
      | sort \
      > ucscContigLoc.txt
    # took ~7 minutes
    # The IDs are non-unique (can be multiply mapped).  This is OK if 
    # everything else that we relate to these uniquely maps to snp_id.
    wc -l ucscContigLoc.txt
#16232825 ucscContigLoc.txt
    awk '{print $1;}' ucscContigLoc.txt | uniq | wc -l
#14304640

    # SNP -> valid, avHet, avHetSE
    # SNP has only snp_id as identifier, nothing relating to assembly.
    hgsql mm9snp128 -NBe \
      'select snp_id, validation_status, avg_heterozygosity, het_se \
       from SNP;' \
    | sort \
      > ucscSNP.txt
    # Check ID uniqueness:
    wc -l ucscSNP.txt
#14380527 ucscSNP.txt
    awk '{print $1;}' ucscSNP.txt | uniq | wc -l
#14380527

    # ContigLocusId -> func
    # ContigLocusId has only snp_id as an identifier (it gives one 
    # example contig if the SNP is on multiple contigs).  
    # The sort options and awk are to convert multiple entries with different
    # function classes for the same SNP into one entry per SNP with a list
    # of function classes.
    hgsql mm9snp128 -NBe \
      'select snp_id, fxn_class from ContigLocusId;' \
    | sort -u -k1,1 -k2,2n  \
    | awk '{if (prevId == $1) { prevFunc = prevFunc $2 ","; } \
            else { if (prevId) {print prevId "\t" prevFunc;} \
                                prevFunc = $2 ","; }} \
           {prevId = $1;} \
           END {print prevId "\t" prevFunc;}' \
      > ucscFunc.txt
    # Check ID uniqueness:
    wc -l ucscFunc.txt
#5878591 ucscFunc.txt
    awk '{print $1;}' ucscFunc.txt | sort -u | wc -l
#5878591

    # MapInfo -> weight
    # MapInfo needs assembly+snp_ids in order to have unique IDs.
    time hgsql mm9snp128 -e \
      'alter table MapInfo add index (assembly(9));'
#0.000u 0.004s 2:22.64 0.0%      0+0k 0+0io 0pf+0w
    hgsql mm9snp128 -NBe \
      'select snp_id, weight from MapInfo where assembly = "C57BL/6J";' \
      | sort \
      > weight.txt
    # ~1 minute
    # Check ID uniqueness:
    wc -l weight.txt
#14304640 weight.txt
    awk '{print $1;}' weight.txt | uniq | wc -l
#14304640
    awk '{print $2;}' weight.txt | sort -n | uniq -c
#13954580 1
# 113119 2
# 169755 3
#  67186 10
    # SNPs w/weight 0 and 10 will be discarded later.

    # fasta headers -> observed, molType, class
    zcat /cluster/data/dbSNP/128/mouse/rs_fasta/rs_ch*.fas.gz \
    | grep '^>gnl' \
    | perl -wpe 's/^\S+rs(\d+) .*mol="(\w+)"\|class=(\d+)\|alleles="([^"]+)"\|build.*/$1\t$4\t$2\t$3/ || die "Parse error line $.:\n$_\n\t";' \
    | sort \
      > ucscGnl.txt
    # ~4 minutes
    wc -l ucscGnl.txt
#14380527 ucscGnl.txt
    awk '{print $1;}' ucscGnl.txt | uniq | wc -l
#14380527

    ############### JOIN NCBI COLUMNS TO GET UCSC SNP COLUMNS ################
    # Join files by ID.  
    time join -a 1 -e MISSING -t '	' ucscContigLoc.txt weight.txt \
      > ucscCL+w.txt
#26.811u 4.091s 1:02.59 49.3%    0+0k 0+0io 0pf+0w
    wc -l ucscCL+w.txt 
#16232825 ucscCL+w.txt
    # Same as ucscContigLoc.txt above, good.
    # Any missing weights?
    grep MISSING ucscCL+w.txt | head
    # No output, good.

    # Join the files with SNP-only IDs.
    time join -e MISSING -t '	' ucscGnl.txt ucscSNP.txt \
      > ucscG+S.txt
#16.591u 1.935s 0:28.44 65.1%    0+0k 0+0io 0pf+0w
    wc -l ucscG+S.txt
#14380527 ucscG+S.txt
    # Same as ucscSNP.txt and ucscGnl.txt above.
    grep MISSING ucscG+S.txt | wc -l
#0
    time join -a 1 -e MISSING -o 1.1,1.2,1.3,1.4,1.5,1.6,1.7,2.2 \
      -t '	' ucscG+S.txt ucscFunc.txt \
      > ucscG+S+F.txt
#17.438u 2.115s 0:24.83 78.6%    0+0k 0+0io 0pf+0w
    wc -l ucscG+S+F.txt
#14380527 ucscG+S+F.txt
    grep MISSING ucscG+S+F.txt | wc -l 
#8501936
    # Not surprising -- ucscFunc.txt has only 5878591 lines.
    expr 14380527 - 5878591
#8501936

    # Final join -- treat ContigLoc as authoritative (since it has coords).
    # Arrange columns in same order as in the SNP table, with extras for
    # checking at the end (phys_pos_from).
    # chr chrS chrE name strand refN obs molT cls val aH aHSE fxn locT wt ...
    time join -a 1 -e MISSING -t '	' \
  -o '1.2 1.3 1.4 1.1 1.6 1.7 2.2 2.3 2.4 2.5 2.6 2.7 2.8 1.5 1.9 1.8' \
      ucscCL+w.txt ucscG+S+F.txt \
      > ucscNcbiSnp.ctg.txt
#41.401u 6.045s 1:02.04 76.4%    0+0k 0+0io 0pf+0w
    wc -l ucscNcbiSnp.ctg.txt
#16232825 ucscNcbiSnp.ctg.txt
    grep MISSING ucscNcbiSnp.ctg.txt | awk '{print $4;}' | uniq | wc -l
#8432812
    # a bit less than the 8501936 missing FUNC's above... perhaps some
    # of those did not have any mappings in ucscContigLoc.txt.

    # Lift the map contig coordinates to chrom coordinates (~2m);
    sed -re 's/\t(N[A-Z]_[0-9]+)\.[0-9]+\t/\t\1\t/;' \
      /cluster/data/mm9/jkStuff/mm9.contigs.lift > liftContigs.lft
    time liftUp ucscNcbiSnp.bed liftContigs.lft warn ucscNcbiSnp.ctg.txt
#131.007u 7.438s 2:26.48 94.5%   0+0k 0+0io 0pf+0w
    wc -l ucscNcbiSnp.bed
#16232825 ucscNcbiSnp.bed

    # At this point, move back from /scratch to /cluster/data.
    nice gzip ucscNcbiSnp.bed
    cp -p ucscNcbiSnp.bed.gz /cluster/data/dbSNP/128/mouse/

    # Translate NCBI's encoding into UCSC's, and perform a bunch of
    # checks.  This is where developer involvement is most likely as
    # NCBI extends the encodings used in dbSNP.
    cd /cluster/data/dbSNP/128/mouse/
    gunzip ucscNcbiSnp.bed.gz
    time snpNcbiToUcsc ucscNcbiSnp.bed /cluster/data/mm9/mm9.2bit \
      snp128
#count of snps with weight  0 = 0
#count of snps with weight  1 = 13954580
#count of snps with weight  2 = 226238
#count of snps with weight  3 = 712684
#count of snps with weight 10 = 1339323
#Found no errors.
#162.963u 9.783s 3:02.77 94.5%   0+0k 0+0io 1pf+0w
    wc -l snp*
#  14893502 snp128.bed
#        22 snp128.sql
#         0 snp128Errors.bed
#        18 snp128ExceptionDesc.tab
#   1898314 snp128Exceptions.bed

    # Make one big fasta file.  (note: snp126 skipped chrUn... but it's small
    # compared to chr1, chr2 etc.)
    # Some of the fasta files have SNPs that were not mapped to the reference
    # assembly.  Make sure there is no overlap with snp128.bed, and then
    # move them out of the way.
    zcat rs_fasta/rs_chNotOn.fas.gz \
    | perl -we 'while (<>) { \
                  next unless /^>gnl/; s/^>gnl.dbSNP.(rs\d+).*/$1/; print; }' \
    | sort | grep -Fwf - snp128.bed | head
    ^chNotOn^chAltOnly
    # No output from either command -- good.
    mkdir rs_fasta/omitted
    mv rs_fasta/rs_ch{AltOnly,NotOn}.fas.gz rs_fasta/omitted/

    zcat rs_fasta/rs_ch*.fas.gz \
    | perl -wpe 's/^>gnl\|dbSNP\|(rs\d+) .*/>$1/ || ! /^>/ || die;' \
      > snp128.fa
    # Check for duplicates.
    grep ^\>rs snp128.fa | sort > /scratch/tmp/seqHeaders
    wc -l /scratch/tmp/seqHeaders
#14304640 /scratch/tmp/seqHeaders
    uniq /scratch/tmp/seqHeaders | wc -l
#14304640
    # Use hgLoadSeq to generate .tab output for sequence file offsets,
    # and keep only the columns that we need: acc and file_offset.
    # Index it and translate to snpSeq table format.
    time hgLoadSeq -test placeholder snp128.fa
#42.866u 4.977s 0:48.09 99.4%    0+0k 0+0io 4pf+0w
    cut -f 2,6 seq.tab > snp128Seq.tab
    rm seq.tab

    ssh hgwdev
    # Load up main track tables.
    cd /cluster/data/dbSNP/128/mouse
    time nice hgLoadBed -tab -noSort -onServer -tmpDir=/scratch/tmp \
      mm9 snp128 -sqlTable=snp128.sql snp128.bed
#Loaded 14893502 elements of size 17
#67.395u 12.818s 8:43.01 15.3%   0+0k 0+0io 0pf+0w
    sed -e 's/snp125/snp128/' ~/kent/src/hg/lib/snp125Exceptions.sql \
      > snp128Exceptions.sql
    time nice hgLoadBed -tab -onServer -tmpDir=/scratch/tmp \
      mm9 snp128Exceptions -sqlTable=snp128Exceptions.sql \
      snp128Exceptions.bed
#Loaded 1898314 elements of size 5
#8.925u 1.354s 0:52.66 19.5%     0+0k 0+0io 0pf+0w
    sed -e 's/snp125/snp128/' ~/kent/src/hg/lib/snp125ExceptionDesc.sql \
      > snp128ExceptionDesc.sql
    # 3/11/08: reloaded snp128ExceptionDesc (tweaked wording)
    hgLoadSqlTab mm9 snp128ExceptionDesc snp128ExceptionDesc.sql \
      snp128ExceptionDesc.tab
    # Load up sequences.
    sed -e 's/snpSeq/snp128Seq/' ~/kent/src/hg/lib/snpSeq.sql \
      > snp128Seq.sql
    mkdir -p /gbdb/mm9/snp
    ln -s /cluster/data/dbSNP/128/mouse/snp128.fa /gbdb/mm9/snp/snp128.fa
    time nice hgLoadSqlTab mm9 snp128Seq snp128Seq.sql snp128Seq.tab
#0.000u 0.003s 3:02.66 0.0%      0+0k 0+0io 0pf+0w
    # Put in a link where one would expect to find the track build dir...
    ln -s /cluster/data/dbSNP/128/mouse /cluster/data/mm9/bed/snp128

#########################################################################
# BLASTZ/CHAIN/NET BOSTAU4 (DONE - 2008-03-11,12 - Hiram)
    ssh kkstore06
    screen # use a screen to manage this multi-day job
    mkdir /cluster/data/mm9/bed/blastzBosTau4.2008-03-11
    cd /cluster/data/mm9/bed/blastzBosTau4.2008-03-11

    cat << '_EOF_' > DEF
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Cow bosTau4
SEQ2_DIR=/san/sanvol1/scratch/bosTau4/bosTau4.2bit
SEQ2_LEN=/cluster/data/bosTau4/chrom.sizes
# Maximum number of scaffolds that can be lumped together
SEQ2_LIMIT=200
SEQ2_CHUNK=20000000
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzBosTau4.2008-03-11
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl `pwd`/DEF -verbose=2 \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	    -syntenicNet > do.log 2>&1 &
    #	real    460m51.297s
    cat fb.mm9.chainBosTau4Link.txt
    #	690095394 bases of 2620346127 (26.336%) in intersection

    mkdir /cluster/data/bosTau4/bed/blastz.mm9.swap
    cd /cluster/data/bosTau4/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/mm9/bed/blastzBosTau4.2008-03-11/DEF \
	-bigClusterHub=pk -chainMinScore=3000 -chainLinearGap=medium \
	    -swap -syntenicNet > swap.log 2>&1 &
    #	real    117m39.571s
    cat fb.bosTau4.chainMm9Link.txt
    #	707444627 bases of 2731830700 (25.896%) in intersection

#######################################################################
# BLASTZ/CHAIN/NET Lamprey petMar1 (DONE - 2008-04-14 - Hiram)
    ssh kkstore06
    screen # use screen to control this job
    mkdir /cluster/data/mm9/bed/blastzPetMar1.2008-04-14
    cd /cluster/data/mm9/bed/blastzPetMar1.2008-04-14

    cat << '_EOF_' > DEF
# Mouse vs. Lamprey

# using the "distant" genome alignment parameters
#	see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Mouse
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1

# QUERY: Lamprey petMar1
SEQ2_DIR=/scratch/data/petMar1/petMar1.2bit
SEQ2_LEN=/scratch/data/petMar1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=300
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzPetMar1.2008-04-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/mm9/bed/blastzPetMar1.2008-04-14/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-qRepeats=windowmaskerSdust -bigClusterHub=pk > do.log 2>&1 &

    cat fb.mm9.chainPetMar1Link.txt
    #	29113438 bases of 2620346127 (1.111%) in intersection

    #	That is OK, now for the swap:
    mkdir /cluster/data/petMar1/bed/blastz.mm9.swap
    cd /cluster/data/petMar1/bed/blastz.mm9.swap
    time doBlastzChainNet.pl -verbose=2 -swap \
	/cluster/data/mm9/bed/blastzPetMar1.2008-04-14/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-qRepeats=windowmaskerSdust -bigClusterHub=pk > swap.log 2>&1 &
    #	real    33m29.076s
    cat  fb.petMar1.chainMm9Link.txt
    #	26052507 bases of 831696438 (3.132%) in intersection

#######################################################################
# BLASTZ/CHAIN/NET Lancelet braFlo1 (DONE - 2008-04-14 - Hiram)
    ssh kkstore06
    screen # use screen to control this job
    mkdir /cluster/data/mm9/bed/blastzBraFlo1.2008-04-14
    cd /cluster/data/mm9/bed/blastzBraFlo1.2008-04-14

    cat << '_EOF_' > DEF
# Mouse vs. Lancelet

# using the "distant" genome alignment parameters
#	see also: http://genomewiki.ucsc.edu/index.php/Mm9_multiple_alignment
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Mouse
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1

# QUERY: Lancelet braFlo1 - largest chunk big enough for largest scaffold
#       Largest scaffold 7,200,735 - 3032 scaffolds + chrM
SEQ2_DIR=/scratch/data/braFlo1/braFlo1.2bit
SEQ2_LEN=/scratch/data/braFlo1/chrom.sizes
SEQ2_CTGDIR=/scratch/data/braFlo1/braFlo1UnScaffolds.2bit
SEQ2_CTGLEN=/scratch/data/braFlo1/braFlo1UnScaffolds.sizes
SEQ2_LIFT=/scratch/data/braFlo1/braFlo1.lift
SEQ2_CHUNK=10000000
SEQ2_LIMIT=30
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzBraFlo1.2008-04-14
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/mm9/bed/blastzBraFlo1.2008-04-14/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-qRepeats=windowmaskerSdust -bigClusterHub=kk > do.log 2>&1 &
    #	real    408m36.691s
    cat fb.mm9.chainBraFlo1Link.txt
    #	26725980 bases of 2620346127 (1.020%) in intersection

    #	That is OK, now for the swap:
    mkdir /cluster/data/braFlo1/bed/blastz.mm9.swap
    cd /cluster/data/braFlo1/bed/blastz.mm9.swap
    time doBlastzChainNet.pl -verbose=2 -swap \
	/cluster/data/mm9/bed/blastzBraFlo1.2008-04-14/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-qRepeats=windowmaskerSdust -bigClusterHub=kk > swap.log 2>&1 &
    #	real    12m23.402s
    cat  fb.braFlo1.chainMm9Link.txt
    #	31517169 bases of 923355587 (3.413%) in intersection

###########################################################################
#  LOAD Transcriptome data (DONE - 2008-05-06 - Hiram)
    # data from Christian Iseli 'Christian.Iseli at licr.org'
    ssh hgwdev
    mkdir /cluster/data/mm9/bed/transcriptome
    cd /cluster/data/mm9/bed/transcriptome
    wget --timestamping ftp://ftp.licr.org/pub/MTr.gtf.gz
    wget --timestamping ftp://ftp.licr.org/pub/txg.tar.gz
    # Convert the downloaded GTF to extended genePred.  The input name must
    # match the file fetched by wget above exactly (MTr.gtf.gz, not MTR).
    gtfToGenePred -genePredExt MTr.gtf.gz MTr.gp
    hgLoadGenePred mm9 transcriptome -genePredExt MTr.gp

    tar xvzf txg.tar.gz
    # Do a little data cleanup and transformation and
    #	load splice graphs into database.
    sed 's/altGraphX/sibTxGraph/' ~/kent/src/hg/lib/altGraphX.sql \
	> sibTxGraph.sql
    cat txg/*.txg | txgToAgx stdin stdout \
	| hgLoadBed -notItemRgb -sqlTable=sibTxGraph.sql mm9 sibTxGraph stdin
    #	Loaded 52065 elements of size 18

   # Create sibAltEvents track for analysed alt-splices.
   cat txg/*.txg \
	| txgAnalyze stdin /cluster/data/mm9/mm9.2bit stdout \
	| awk '$2 >= 0' | sort | uniq > sibAltEvents.bed
   hgLoadBed mm9 sibAltEvents sibAltEvents.bed

#############################################################################
# BLASTZ/CHAIN/NET equCab2 (DONE - 2008-04-17 - larrym)
    ssh kkstore04
    screen #	use screen to control this multi-day job
    mkdir /cluster/data/mm9/bed/blastz.equCab2.2008-04-15
    cd /cluster/data/mm9/bed/blastz.equCab2.2008-04-15
    cat << '_EOF_' > DEF
# Mouse vs. Horse

BLASTZ_M=50

# TARGET: Mouse MM9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/cluster/data/mm9/chrom.sizes 
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/cluster/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/cluster/data/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=200
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastz.equCab2.2008-04-15
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl `pwd`/DEF \
	-verbose=2 -bigClusterHub=pk  \
      -chainMinScore=3000 -chainLinearGap=medium \
      -blastzOutRoot /cluster/bluearc/equCab2/blastz.hg18 >>& do.log &

    ln -s blastz.equCab2.2008-04-15 /cluster/data/mm9/bed/blastz.equCab2

############################################################################
# Reload CCDS from CCDS.20080502 dump (2008-05-03 markd)
    # import ccds database as described in ccds.txt
    set db=mm9
    set ncbiBld=37.1
    # create and load ccdsGene and ccdsInfo tables from imported database
    /cluster/data/genbank/bin/x86_64/ccdsMkTables -loadDb ccds ${db} ${ncbiBld} ccdsInfo ccdsGene

    # ccdsKgMap
    /cluster/data/genbank/bin/x86_64/mkCcdsGeneMap -db=${db} -loadDb ccdsGene knownGene ccdsKgMap

    checkTableCoords ${db} -verbose=2 ccdsGene
    # update all.joiner to include ${db} in ccdsDb
    joinerCheck -database=${db} -identifier=ccdsGeneId ~/compbio/genbank/kent/src/hg/makeDb/schema/all.joiner
    # request push of 
        ccdsGene
        ccdsInfo
        ccdsKgMap
    # << emacs
############################################################################
#  update vega genes to version 31 (v49 of Ensembl genes)
#	(DONE - 2008-05-15 - Hiram)
    mkdir  /cluster/data/mm9/bed/vega31_49
    cd  /cluster/data/mm9/bed/vega31_49
    wget --timestamping \
	"ftp://ftp.sanger.ac.uk/pub/vega/mouse/gtf_file.gz"
    wget --timestamping \
	"ftp://ftp.sanger.ac.uk/pub/vega/mouse/CHANGELOG.gz"
    wget --timestamping \
	"ftp://ftp.sanger.ac.uk/pub/vega/mouse/catalog.txt"
    wget --timestamping \
"ftp://ftp.sanger.ac.uk/pub/vega/mouse/pep/Mus_musculus.VEGA.apr.pep.tot.fa.gz"

    #	processing similar to the same processing for Ensembl genes,
    #	from /cluster/data/mm9/bed/ensGene.49/process/doProcess.csh
    cp -p /cluster/data/mm9/bed/ensGene.49/process/randoms.mm9.lift .
    zcat gtf_file.gz \
        | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/" \
        | liftUp -type=.gtf stdout randoms.mm9.lift carry stdin \
        | gzip > allGenes.gtf.gz

    gtfToGenePred -infoOut=infoOut.txt -genePredExt allGenes.gtf.gz stdout \
	| gzip > mm9.allGenes.gp.gz
    /cluster/home/hiram/kent/src/hg/utils/automation/extractGtf.pl \
	infoOut.txt > ensGtp.tab
    genePredCheck -db=mm9 mm9.allGenes.gp.gz
    #	checked: 54208 failed: 0
    zcat allGenes.gtf.gz | grep -i pseudo > pseudo.gtf
    zcat allGenes.gtf.gz | grep -v -i pseudo > not.pseudo.gtf
    gtfToGenePred -genePredExt pseudo.gtf pseudo.gp
    gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp
    genePredCheck -db=mm9 pseudo.gp
    #	checked: 3989 failed: 0
    genePredCheck -db=mm9 not.pseudo.gp
    #	checked: 50219 failed: 0
    hgLoadGenePred -genePredExt mm9 vegaGene not.pseudo.gp
    hgLoadGenePred -genePredExt mm9 vegaPseudoGene pseudo.gp

############################################################################
# BLASTZ/CHAIN/NET 2X Ground squirrel: speTri0 (In progress 2008-05-16 kate)

    ssh kkstore06
    cd /cluster/data/mm9/bed
    mkdir blastzSpeTri0.2008-05-16
    cd blastzSpeTri0.2008-05-16

    cat << '_EOF_' > DEF
# Mouse vs. Ground squirrel

BLASTZ_M=50

# TARGET: Mouse MM9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes 
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Ground squirrel speTri0
SEQ2_DIR=/scratch/data/speTri0/speTri0.2bit
SEQ2_LEN=/cluster/data/speTri0/chrom.sizes
SEQ2_CHUNK=30000000
SEQ2_LIMIT=500
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzSpeTri0.2008-05-16
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    doBlastzChainNet.pl `pwd`/DEF -bigClusterHub=pk  \
      -chainMinScore=3000 -chainLinearGap=medium >& do.log &

    ln -s blastzSpeTri0.2008-05-16 /cluster/data/mm9/bed/blastz.speTri0

    #	create reciprocal best chains/nets
    ssh hgwdev
    cd /cluster/data/mm9/bed/blastz.speTri0
    /cluster/bin/scripts/doRecipBest.pl mm9 speTri0 >&! rbest.log &

    # failed coverage check, shouldn't be fatal ?
    # resume creating axt's and maf's
    # use axtChain/doRecipBest.csh to create resume.csh

    ssh kkstore06
    cd /cluster/data/mm9/bed/blastz.speTri0/axtChain
    csh resume.csh >&! resume.log &

    ssh hgwdev
    cd /cluster/data/mm9/bed/blastz.speTri0
    featureBits mm9 chainSpeTri0Link > fb.mm9.chainSpeTri0Link.txt
    cat fb.mm9.chainSpeTri0Link.txt
    # 673393210 bases of 2620346127 (25.699%) in intersection

#################
# Rodent multiz (mouse, guinea pig, ground squirrel) 
# for Jurgen Schmitz (2008-06-07 kate)
# Redo with unfiltered net mafs, to maximize squirrel sequence

    ssh kkstore06
    mkdir /cluster/data/mm9/bed/multiz3way
    cd /cluster/data/mm9/bed/multiz3way
    mkdir mafLinks
    mkdir mafLinks/cavPor3
    cd mafLinks/cavPor3
    # high quality mammalian genome, so use syntenic net
    ln -s ../../../blastz.cavPor3/mafSynNet/*.maf.gz .
    mkdir ../speTri0
    cd ../speTri0
    # low coverage genome, so use reciprocal best
    #ln -s ../../../blastz.speTri0/mafRBestNet/*.maf.gz .
    # redo with unfiltered, to get more squirrel sequence
    # NOTE(review): this path previously read "maftNet"; assuming that was a
    # typo for the unfiltered net MAF directory "mafNet" -- confirm on disk.
    ln -s ../../../blastz.speTri0/mafNet/*.maf.gz .

    #	Copy MAFs to kluster-friendly disk 
    mkdir -p /san/sanvol1/scratch/mm9/multiz3way
    cd /san/sanvol1/scratch/mm9/multiz3way
    rsync -a --copy-links --progress \
	/cluster/data/mm9/bed/multiz3way/mafLinks/ .

    # get latest PSU utilities
    mkdir penn
    set p=/cluster/bin/penn/multiz.v11.2007-03-19/multiz-tba
    cp -p $p/{autoMZ,multiz,maf_project} penn

    # the autoMultiz cluster run
    ssh pk
    cd /cluster/data/mm9/bed/multiz3way

    # create species list and stripped down tree for autoMZ
    cat > tree.nh << 'EOF'
((mm9 cavPor3) speTri0)
'EOF'
    cat > species.lst << 'EOF'
mm9 cavPor3 speTri0
'EOF'
    mkdir run maf
    cd run

    cat > autoMultiz << '_EOF_'
#!/bin/csh -ef
set db = mm9
set c = $1
set maf = $2
set binDir = /san/sanvol1/scratch/$db/multiz3way/penn
set tmp = /scratch/tmp/$db/multiz.$c
set pairs = /san/sanvol1/scratch/$db/multiz3way
rm -fr $tmp
mkdir -p $tmp
cp ../{tree.nh,species.lst} $tmp
pushd $tmp
foreach s (`cat species.lst`)
    set in = $pairs/$s/$c.maf
    set out = $db.$s.sing.maf
    if ($s == $db) then
	continue
    endif
    if (-e $in.gz) then
	zcat $in.gz > $out
    else if (-e $in) then
	cp $in $out
    else
	echo "##maf version=1 scoring=autoMZ" > $out
    endif
end
set path = ($binDir $path); rehash
$binDir/autoMZ + T=$tmp E=$db "`cat tree.nh`" $db.*.sing.maf $c.maf
popd
cp $tmp/$c.maf $maf
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod +x autoMultiz

cat  << '_EOF_' > template
#LOOP
./autoMultiz $(root1) {check out line+ /cluster/data/mm9/bed/multiz3way/maf/$(root1).maf}
#ENDLOOP
'_EOF_'
    # << happy emacs

    awk '{print $1}' /cluster/data/mm9/chrom.sizes > chrom.lst
    gensub2 chrom.lst single template jobList
    para create jobList
    # 35 jobs
    para try

    para check

#Completed: 35 of 35 jobs
#CPU time in finished jobs:       6086s     101.43m     1.69h    0.07d  0.000 y
#IO & Wait Time:                   240s       4.00m     0.07h    0.00d  0.000 y
#Average job time:                 181s       3.01m     0.05h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             502s       8.37m     0.14h    0.01d
#Submission to last job:           506s       8.43m     0.14h    0.01d

    ssh hgwdev
    cd /usr/local/apache/htdocs/goldenPath/mm9
    mkdir multizRodent3way
    cd multizRodent3way
    ln -s /cluster/data/mm9/bed/multiz3way/maf .
    cat > README.txt << 'EOF'
This directory contains multiple alignments of 2 rodent genome
assemblies to the mouse genome (mm9, July 2007):

    _ guinea pig         Cavia porcellus                Feb. 2008, cavPor3
    _ ground squirrel    Spermophilus tridecemlineatus  Jun. 2006, speTri0

'EOF'
# << emacs

############################################################################
# TRANSMAP vertebrate.2008-05-20 build  (2008-05-24 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-05-20

see doc/builds.txt for specific details.
############################################################################
############################################################################
# TRANSMAP vertebrate.2008-06-07 build  (2008-06-30 markd)

vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2008-06-30

see doc/builds.txt for specific details.
############################################################################

#########################################################################
# ORegAnno - Open Regulatory Annotations
# loaded July 7, 2008
# updated Sept 29, 2008
# loaded by Belinda Giardine, in same manner as hg18 ORegAnno track


############################################################################
# JAX/MGI TRACKS (DONE 6/22/11 angie)
# Previously done 9/20/10 in /hive/data/genomes/mm9/bed/jax/2010_09 (pushed)
# Previously done 8/20/09 in /hive/data/genomes/mm9/bed/jax/2009_08 (pushed)
# Previously done 6/11/09 in /hive/data/genomes/mm9/bed/jax/2009_06 (pushed)
# Previously done 4/24/09 in /hive/data/genomes/mm9/bed/jax/2009_04 (not pushed)
# Previously done 9/24/08 in /cluster/data/mm9/bed/jax/2008_09
    mkdir -p /hive/data/genomes/mm9/bed/jax/2011_06
    cd /hive/data/genomes/mm9/bed/jax/2011_06
    wget --timestamping ftp://ftp.informatics.jax.org/pub/gbrowse/\*
    wget --timestamping ftp://ftp.informatics.jax.org/pub/reports/MGI_PhenotypicAllele.rpt
    # Oops, June 2011 has a file that ends in "gff" not ".gff":
    mv MP_0010768_mortalitygff MP_0010768_mortality.gff
    # And June 2011 got some stale files from the ftp site:
    rm -f MP_0005393_skin_coat_nails.gff \
      MP_0005392_touch_vibrissae.gff \
      MP_0005374_lethality-embryonic_perinatal.gff \
      MP_0005373_lethality-postnatal.gff \
      MP_0005372_life_span-post-weaning_aging.gff

    # Jax Rep Transcript track
    # SEQ_RepTransGenomic_rpt.gff --> jaxRepTranscript{,Alias}
    # -- names like AK016604_4933401J01Rik, NM_001011874_AY534250
    # -- aliases ~ MGI:\d+
    ~/kent/src/hg/jaxMgi/parseRepTranscript.pl SEQ_RepTransGenomic_from_models.gff \
      > jaxRepTranscript.gff

    # Jax Allele track
    # AL_*.gff --> jaxAllele{,Info}
    # -- bed12Source -- add type from filename
    # -- names like NM_011283_Rp1h<tm1Jnz>, XM_129721_Slc9a2<tm1Ges>
    # -- Info: name, mgiID, source {"Gene trapped", ...}
    rm -f jaxGeneTrap.bed jaxAlleleInfo.tab fixJaxAllele.sql
    foreach f (MGI_GT[DR]NA_GBrowse.gff)
      echo $f:t:r | sed -e 's/MGI_//; s/_GBrowse//'
      ~/kent/src/hg/jaxMgi/parseAllele.pl $f \
      | ldHgGene mm9 placeholder stdin -nobin -out=stdout \
      | /cluster/bin/scripts/genePredToBed \
      | perl -wpe 'chomp; @w = split; $w[3] =~ s/\|\|(\w+)$// || die;  $source = $1; \
                   $w[8] = ($source eq "GeneTrappedDna") ? "218,112,214" : \
                           ($source eq "GeneTrappedRna3") ? "50,205,50" : "25,25,112"; \
                   $_ = join("\t", @w, $source) . "\n";' \
      >> jaxGeneTrap.bed
      if ($status) then
        echo "\nERRORS - FixMe\n"
        break
      endif
    end
    mv jaxAlleleInfo.tab jaxGeneTrapInfo.tab
    mv fixJaxAllele.sql fixJaxGeneTrap.sql
    cut -f 13 jaxGeneTrap.bed | sort | uniq -c
# 298285 GeneTrappedDna
# 186042 GeneTrappedRna3
#  35389 GeneTrappedRna5
    rm -f jaxAllele.bed jaxAlleleInfo.tab fixJaxAllele.sql
    # Build jaxAllele.bed from the per-category allele GFF files.
    # For each file: echo its category, parse it, convert to genePred and
    # then BED, strip the trailing "||source" tag from the name column and
    # append that source as an extra final column.
    # Uses the checked-in parseAllele.pl (same path as the gene trap loop
    # above) rather than a copy in the working directory.
    # NOTE(review): confirm no locally modified ./parseAllele.pl was intended.
    foreach f (AL_{IND,OTHER,SPON,TARG,TRANS}.gff)
      echo $f:t:r | sed -e 's/AL_//;'
      ~/kent/src/hg/jaxMgi/parseAllele.pl $f \
      | ldHgGene mm9 placeholder stdin -nobin -out=stdout \
      | /cluster/bin/scripts/genePredToBed \
      | perl -wpe 'chomp; @w = split; $w[3] =~ s/\|\|(\w+)$// || die; \
                   $_ = join("\t", @w, $1) . "\n";' \
      >> jaxAllele.bed
      if ($status) then
        echo "\nERRORS - FixMe\n"
        break
      endif
    end
    cut -f 13 jaxAllele.bed | sort | uniq -c
#TARG
#Missing > for mRNA name NM_016893_Fut8<tm1Nt
#   1333 Induced
#     98 Other
#   1181 Spontaneous
#  14250 Targeted
#     39 Transgenic

    # Jax Phenotype track
    # MP_*.gff --> jaxPhenotype{,Alias}
    # -- bed12Source -- add type from filename
    # -- names like NM_001001488_Atp8b1
    rm -f jaxPhenotype.bed jaxPhenotypeAlias.tab fixJaxPhenotype.sql
    foreach f (MP_*.gff)
      # Derive the phenotype label from the file name: drop the MP_<digits>_
      # prefix, CamelCase the remainder, then shorten known long names.
      # Names that need no shortening must match the whitelist exactly; the
      # alternation is grouped so ^ and $ apply to every alternative, not
      # just the first and last.  Anything else dies, leaving $type empty.
      set type = `echo $f:t:r \
        | perl -wpe 's/MP_[0-9]*_//; s/[_-](\w)/\u$1/g; s/^(\w)/\u$1/; \
                    s@AdiposeTissue@Adipose@ || \
                    s@BehaviorNeurological@Behavior@ || \
                    s@CardiovascularSystem@Cardiovascular@ || \
                    s@DigestiveAlimentary@Digestive@ || \
                    s@EndocrineExocrineGland@Gland@ || \
                    s@GrowthSize@Growth Size@ || \
                    s@HearingEar@Hearing/Ear@ || \
                    s@HematopoieticSystem@Hematopoietic@ || \
                    s@HomeostasisMetabolism@Homeostasis@ || \
                    s@ImmuneSystem@Immune@ || \
                    s@LimbsDigitsTail@Limbs and Tail@ || \
                    s@LiverBiliarySystem@Liver and Bile@ || \
                    s@NervousSystem@Nervous System@ || \
                    s@RenalUrinarySystem@Renal/Urinary@ || \
                    s@ReproductiveSystem@Reproductive@ || \
                    s@RespiratorySystem@Respiratory@ || \
                    s@TasteOlfaction@Taste/Smell@ || \
                    s@Tumorigenesis@Tumorigenesis@ || \
                    s@VisionEye@Vision/Eye@ || \
                    m/^(Craniofacial|Cellular|Embryogenesis|Integument|Mortality|Muscle|Normal|Other|Pigmentation|Skeleton)$/ || \
                    die "\n\nUnrecognized phenotype $_\n\n\t";'`
      echo $type
      if ("$type" == "") break
      ~/kent/src/hg/jaxMgi/parsePhenotype.pl $f \
      | ldHgGene mm9 placeholder stdin -nobin -out=stdout \
      | /cluster/bin/scripts/genePredToBed \
      | sed -e 's/^/chr/; s@$@'"\t$type"'@;' \
      >> jaxPhenotype.bed
    end
    sort -u jaxPhenotypeAlias.tab > tmp
    mv tmp jaxPhenotypeAlias.tab

    # Jax QTL track
    # QTL*.gff --> jaxQtl2 (or 3?)... but we're missing MIT SSLP marker
    # and CM distance for 2, or those plus flanking markers for 3...
    dos2unix MGI_QTL.gff
    # Compare against the previous update to see if we need to reload.
    # cmp reports the result through its exit status (0 = identical), so run
    # it and test $status; backticks would capture cmp's message text, not a
    # number.  cmp still prints where the files first differ.
    cmp MGI_QTL.gff ../2010_09/MGI_QTL.gff
    if ($status != 0) then
      echo MGI_QTL.gff changed, updating...
      perl -wpe 'chomp; s/\s*$//; \
        ($c, undef, undef, $start, $end, undef, $strand, undef, $info) = \
          split("\t"); \
        if ($info =~ /QTL (\S+); Dbxref "(MGI:\d+)"; Alias .*; Note "([^"]+)"/) { \
          ($name, $mgiID, $desc) = ($1, $2, $3); \
        } else { die "parse\n$info"; } \
        if ($start > $end) { $tmp = $end; $end = $start; $start = $tmp; } \
        $start-- unless $start == 0; \
        s/^.*$/chr$c\t$start\t$end\t$name\t1000\t$strand\t\t$mgiID\t$desc\t0.0\n/;' \
        MGI_QTL.gff > jaxQtl.bed
    else
      echo No change to MGI_QTL.gff
    endif
#MGI_QTL.gff ../2010_09/MGI_QTL.gff differ: char 99310, line 780

    # Extract phenotype-allele relationships:
    # Make a file for the one code not already in a filename:
    cp /dev/null MP_0003012_no_phenotypic_analysis
    # Wrote a script to extract the phenotype-allele relationships --
    # it uses the filenames to map MP:* codes to our phenotype names.
    ~/kent/src/hg/jaxMgi/parsePhenotypicAllele.pl MGI_PhenotypicAllele.rpt > jaxAllelePheno.tab
    # The file "err" has messages about missing data (no gene name in 
    # PhenotypicAllele.rpt, or gene/mgiId not found in jaxAlleleInfo).
    wc -l jaxAllelePheno.tab err
#  15147 jaxAllelePheno.tab
#  11778 err

    # Load tables
    # jaxRepTranscript
    ldHgGene mm9 jaxRepTranscript jaxRepTranscript.gff
#  38436 groups 22 seqs 1 sources 1 feature types
    hgsql mm9 < fixJaxRepTranscript.sql
    hgLoadSqlTab mm9 jaxRepTranscriptAlias \
      ~/kent/src/hg/lib/genericAlias.sql jaxRepTranscriptAlias.tab
    checkTableCoords mm9 jaxRepTranscript
    # jaxAllele
    hgLoadBed -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed12Source.sql \
      mm9 jaxAllele jaxAllele.bed
#Loaded 16901 elements of size 13
    checkTableCoords mm9 jaxAllele
    # fixJaxAllele.sql is empty so don't need to do this:
    # hgsql mm9 < fixJaxAllele.sql
    hgLoadSqlTab mm9 jaxAlleleInfo \
      ~/kent/src/hg/lib/jaxAlleleInfo.sql jaxAlleleInfo.tab
    # jaxGeneTrap
    hgLoadBed -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed12Source.sql \
      mm9 jaxGeneTrap jaxGeneTrap.bed
#Loaded 519716 elements of size 13
    checkTableCoords mm9 jaxGeneTrap
    hgsql mm9 < fixJaxGeneTrap.sql
    hgLoadSqlTab mm9 jaxGeneTrapInfo \
      ~/kent/src/hg/lib/jaxAlleleInfo.sql jaxGeneTrapInfo.tab
    # jaxPhenotype
    hgLoadBed -renameSqlTable -sqlTable=$HOME/kent/src/hg/lib/bed12Source.sql \
      -tab mm9 jaxPhenotype jaxPhenotype.bed
#Loaded 37122 elements of size 13
    checkTableCoords mm9 jaxPhenotype
    # fixJaxPhenotype.sql is empty so don't need to execute it.
    # hgsql mm9 < fixJaxPhenotype.sql
    hgLoadSqlTab mm9 jaxPhenotypeAlias \
      ~/kent/src/hg/lib/genericAlias.sql jaxPhenotypeAlias.tab
    # jaxQtl
    # Reload only when MGI_QTL.gff differs from the previous update.  cmp
    # exits non-zero when the files differ, so test its exit status rather
    # than substituting its text output into the if-expression.
    cmp MGI_QTL.gff ../2010_09/MGI_QTL.gff
    if ($status != 0) then
      echo MGI_QTL.gff changed, updating...
      hgLoadBed -tab -notItemRgb -noBin \
        -sqlTable=$HOME/kent/src/hg/lib/jaxQtl.sql \
        mm9 jaxQtl jaxQtl.bed
#Loaded 1890 elements of size 10
    endif
    checkTableCoords -verbose=2 mm9 jaxQtl
#mm9.jaxQtl item Ath13 chr14:51915898-165887941: chromEnd > chromSize 125194864
#mm9.jaxQtl item Ity2 chr11:145756703-145756947: chromEnd > chromSize 121843856
    hgsql mm9 -e 'update jaxQtl set chromEnd = 125194864 where chrom = "chr14" and chromEnd = 165887941'
    hgsql mm9 -e 'delete from jaxQtl where chrom = "chr11" and chromStart > 121843856'
    checkTableCoords -verbose=2 mm9 jaxQtl
    # No output, good.
    # phenotype-allele relationships
    hgLoadSqlTab mm9 jaxAllelePheno \
      ~/kent/src/hg/lib/jaxAllelePheno.sql jaxAllelePheno.tab

    # Check joiner (noTimes to avoid flood of refGene/knownGene timestamp warnings):
    foreach t (jaxRepTranscript jaxPhenotype jaxGeneTrap)
      echo $t
      runJoiner.csh mm9 $t noTimes
    end


############################################################################
# WOLD RNA-seq
#
# wig files:  bed format, 25mers
ave mm9Brain.wig
#min=1, max=12989, median=6
#7.4M reads

woldRnaSeqBrain

##########################################################################
# Fix equCab2 nets and chains to remove duplicate scaffold_34 (DONE - 2008-08-19 - larrym)

# NOTE(review): this was recorded as "fixChainNetEquCab2 hg18", apparently
# carried over from hg18.txt; the tables listed below (chr1-19, X, Y, M and
# the chr*_random/chrUn_random set) are the mm9 ones -- confirm.
fixChainNetEquCab2 mm9

deleted:	3100 from chr1_chainEquCab2
deleted:	7362 from chr10_chainEquCab2
deleted:	8472 from chr11_chainEquCab2
deleted:	1078 from chr12_chainEquCab2
deleted:	2227 from chr13_chainEquCab2
deleted:	2 from chr13_random_chainEquCab2
deleted:	3605 from chr14_chainEquCab2
deleted:	6773 from chr15_chainEquCab2
deleted:	3400 from chr16_chainEquCab2
deleted:	0 from chr16_random_chainEquCab2
deleted:	3741 from chr17_chainEquCab2
deleted:	3 from chr17_random_chainEquCab2
deleted:	334 from chr18_chainEquCab2
deleted:	5620 from chr19_chainEquCab2
deleted:	5 from chr1_random_chainEquCab2
deleted:	23003 from chr2_chainEquCab2
deleted:	1265 from chr3_chainEquCab2
deleted:	0 from chr3_random_chainEquCab2
deleted:	2567 from chr4_chainEquCab2
deleted:	0 from chr4_random_chainEquCab2
deleted:	967 from chr5_chainEquCab2
deleted:	0 from chr5_random_chainEquCab2
deleted:	3419 from chr6_chainEquCab2
deleted:	10493 from chr7_chainEquCab2
deleted:	0 from chr7_random_chainEquCab2
deleted:	1284 from chr8_chainEquCab2
deleted:	1 from chr8_random_chainEquCab2
deleted:	10185 from chr9_chainEquCab2
deleted:	1 from chr9_random_chainEquCab2
deleted:	4 from chrM_chainEquCab2
deleted:	8 from chrUn_random_chainEquCab2
deleted:	1585 from chrX_chainEquCab2
deleted:	3 from chrX_random_chainEquCab2
deleted:	19 from chrY_chainEquCab2
deleted:	70 from chrY_random_chainEquCab2
deleted:	18173 from netEquCab2

#########################################################################
# BLASTZ/CHAIN/NET oryLat2 (DONE - 2008-08-25,27 - Hiram)
    ssh kkstore06
    screen	# use a screen to manage this longish running job
    mkdir /cluster/data/mm9/bed/blastzOryLat2.2008-08-25
    cd /cluster/data/mm9/bed/blastzOryLat2.2008-08-25
    cat << '_EOF_' > DEF
# Mouse vs. Medaka
BLASTZ=/cluster/bin/penn/x86_64/lastz

# typical parameters for a genome that is distant from human
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Mouse mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=1

# QUERY: Medaka oryLat2 (40M chunks covers the largest chroms in one gulp)
SEQ2_DIR=/scratch/data/oryLat2/oryLat2.2bit
SEQ2_LEN=/scratch/data/oryLat2/chrom.sizes
SEQ2_CHUNK=40000000
SEQ2_LIMIT=200
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/blastzOryLat2.2008-08-25
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time doBlastzChainNet.pl `pwd`/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-qRepeats=windowmaskerSdust \
	-bigClusterHub=pk -verbose=2 > do.log 2>&1 &
    #	real    124m28.816s
    #	problems with memk today, continuing:
    time doBlastzChainNet.pl `pwd`/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-continue=cat -qRepeats=windowmaskerSdust \
	-smallClusterHub=pk -bigClusterHub=pk -verbose=2 > cat.log 2>&1 &
    #	the kluster is acting up, took several attempts to get one of the
    #	simple cat jobs done, not sure why it was having trouble, continuing:
    time doBlastzChainNet.pl `pwd`/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-continue=chainRun -qRepeats=windowmaskerSdust \
	-smallClusterHub=pk -bigClusterHub=pk -verbose=2 > chainRun.log 2>&1 &
    time doBlastzChainNet.pl `pwd`/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -continue=chainMerge -qRepeats=windowmaskerSdust \
	-smallClusterHub=pk -bigClusterHub=pk -verbose=2 > chainMerge.log 2>&1 &
    #	real    14m58.355s
    cat fb.mm9.chainOryLat2Link.txt
    #	50975949 bases of 2620346127 (1.945%) in intersection

    cd /cluster/data/mm9/bed
    ln -s blastzOryLat2.2008-08-25 blastz.oryLat2
    
    #	That is OK, now for the swap:
    mkdir /cluster/data/oryLat2/bed/blastz.mm9.swap
    cd /cluster/data/oryLat2/bed/blastz.mm9.swap
    time doBlastzChainNet.pl -verbose=2 -swap \
	/cluster/data/mm9/bed/blastzOryLat2.2008-08-25/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-qRepeats=windowmaskerSdust \
	-smallClusterHub=pk -bigClusterHub=pk > swap.log 2>&1 &
    #	real    15m26.642s
    cat fb.oryLat2.chainMm9Link.txt
    #	45837267 bases of 700386597 (6.545%) in intersection

#######################################
# Wold RNA-seq data (Done Jul 30 mikep)
#

df .
#Filesystem           1K-blocks      Used Available Use% Mounted on
#kkstore06-10:/export/cluster/store4
#                     2402304448 2183573728  96700640  96% /cluster/store4
ssh kkstore06
cd /cluster/store4/mm9/bed/woldRnaSeq/

# naming convention: woldRnaSeq (Signal) Tissue Replicate

# rename input wigs to convention
mv mm9Brain.wig   woldRnaSeqSignalBrain1.wigbed
mv mm9Brain2.wig  woldRnaSeqSignalBrain2.wigbed
mv mm9Liver.wig   woldRnaSeqSignalLiver1.wigbed
mv mm9Liver2.wig  woldRnaSeqSignalLiver2.wigbed
mv mm9Muscle.wig  woldRnaSeqSignalMuscle1.wigbed
mv mm9Muscle2.wig woldRnaSeqSignalMuscle2.wigbed

# wigEncode it all
for T in Brain Liver Muscle
do 
  for R in 1 2
  do
  wigEncode woldRnaSeqSignal${T}${R}.wigbed woldRnaSeqSignal${T}${R}.wig woldRnaSeqSignal${T}${R}.wib
  done
done

#Converted woldRnaSeqSignalBrain1.wigbed, upper limit 12989.00, lower limit 1.00
#Converted woldRnaSeqSignalBrain2.wigbed, upper limit 1482.24, lower limit 0.04
#Converted woldRnaSeqSignalLiver1.wigbed, upper limit 44652.00, lower limit 1.00
#Converted woldRnaSeqSignalLiver2.wigbed, upper limit 2567.53, lower limit 0.06
#Converted woldRnaSeqSignalMuscle1.wigbed, upper limit 60949.00, lower limit 1.00
#Converted woldRnaSeqSignalMuscle2.wigbed, upper limit 2726.96, lower limit 0.06

# Load on hgwdev
ssh hgwdev

for T in Brain Liver Muscle
do 
  for R in 1 2
  do
  ln -s /cluster/data/mm9/bed/woldRnaSeq/woldRnaSeqSignal${T}${R}.wib /gbdb/mm9/wib/
  hgLoadWiggle mm9 woldRnaSeqSignal${T}${R} woldRnaSeqSignal${T}${R}.wig 
  done
done
rm wiggle.tab

# do the beds
for F in data/*beds*tgz
do
  echo "untaring $F"
  tar zxvf $F
done

# How many records in the beds?
 wc -l *bed
#   8868804 mm9Brain1.multi.bed
#    856281 mm9Brain1.splices.bed
#  14488584 mm9Brain1.uniqs.bed
#  16180919 mm9Brain2.multi.bed
#     54100 mm9Brain2.spike.bed
#   1570776 mm9Brain2.splices.bed
#  26519333 mm9Brain2.uniqs.bed
#  12794917 mm9Liver1.multi.bed
#   1030969 mm9Liver1.splices.bed
#  13133048 mm9Liver1.uniqs.bed
#  17783124 mm9Liver2.multi.bed
#    414618 mm9Liver2.spike.bed
#   1372984 mm9Liver2.splices.bed
#  17673014 mm9Liver2.uniqs.bed
#  12048985 mm9Muscle1.multi.bed
#   1150895 mm9Muscle1.splices.bed
#  13936012 mm9Muscle1.uniqs.bed
#  16033642 mm9Muscle2.multi.bed
#    589787 mm9Muscle2.spike.bed
#   1347749 mm9Muscle2.splices.bed
#  16632816 mm9Muscle2.uniqs.bed
# 194481357 total

# Just do the splices ones
for T in Brain Liver Muscle
do
  for R in 1 2
  do
  egrep -v "^track" mm9${T}${R}.splices.bed | gawk -v OFS="\t" '{print $1,$2,$3,$4,$5,$6,$2,$3,0,$10,$11,$12}' > woldRnaSeqSplices${T}${R}.bed
  hgLoadBed mm9 woldRnaSeqSplices${T}${R} woldRnaSeqSplices${T}${R}.bed
  done
done
rm bed.tab


#########################################################################
### Affy MOE430 version 2 (DONE - 2008-09-25,10-02 - Hiram)
    # Align probes from MOE430v2 chip.
    #	Data was picked up manually from the Affymetrix WEB site
    #	while logged in to the Affymetrix system, from the page:
# http://www.affymetrix.com/support/technical/byproduct.affx?product=moe430-20
    # found links to the following files:
# -rw-r--r--  1  51429336 Dec  1  2003 Mouse430_2.probe_fasta
# -rw-r--r--  1    163849 Dec  2  2003 Mouse430_2_control
# -rw-r--r--  1  89662619 Dec  2  2003 Mouse430_2.consensus
# -rw-r--r--  1  30999528 Dec  2  2003 Mouse430_2.target
# -rw-r--r--  1  24828845 Jun 12  2006 Mouse430_2.link.psl
# -rw-r--r--  1 119301329 Aug 18  2006 Mouse430_2_ortholog.csv
# -rw-rw-rw-  1  95467111 Jul  7 22:05 Mouse430_2.na26.annot.csv
# -rw-r--r--  1      3188 Jul  8 13:23 3prime-IVT.AFFX_README.NetAffx-CSV-Files.txt
    #	placed into: /hive/data/genomes/mm9/bed/affyMOE430v2/affyData

    #	The GNF folks pointed to data available at:
    #	http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE10246

    ssh memk
#     cat ../affyData/Mouse430_2.probe_fasta \
# 	| sed -e "s/probe:Mouse430_2:/MOE320v2_/; s/:.*//" > MOE430v2_probes.fa
#     cat ../affyData/Mouse430_2.target \
# 	| sed -e "s/target:Mouse430_2:/MOE320v2_/; s/;.*//" > MOE430v2_target.fa
    mkdir /hive/data/genomes/mm9/bed/affyMOE430v2/run
    cd /hive/data/genomes/mm9/bed/affyMOE430v2/run
    mkdir psl

    cut -f1 ../../../chrom.sizes > genome.list
    cat ../affyData/Mouse430_2.consensus \
	| sed -e "s/consensus:Mouse430_2://; s/;.*//" > affyMOE430v2.fa

    ls -1 /hive/data/genomes/mm9/bed/affyMOE430v2/run/affyMOE430v2.fa \
	> probe.list

    cat << '_EOF_' > template
#LOOP
blat -fine -ooc=/scratch/data/mm9/11.ooc /scratch/data/mm9/nib/$(path1).nib $(path2) {check out line+ psl/$(root1).psl}
#ENDLOOP
'_EOF_'
    # << happy emacs
    gensub2 genome.list probe.list template jobList
    para create jobList
    para try ... check ... push ... etc.
    para time
# Completed: 35 of 35 jobs
# CPU time in finished jobs:      22222s     370.36m     6.17h    0.26d  0.001 y
# IO & Wait Time:                   104s       1.74m     0.03h    0.00d  0.000 y
# Average job time:                 638s      10.63m     0.18h    0.01d
# Longest finished job:            1580s      26.33m     0.44h    0.02d
# Submission to last job:          1589s      26.48m     0.44h    0.02d

    # Do sort, best in genome filter, and convert to chromosome coordinates
    # to create affyMOE430v2.psl.
    pslSort dirs raw.psl tmp psl
    pslReps -minCover=0.3 -minAli=0.95 -nearTop=0.005 raw.psl \
	../affyMOE430v2.psl /dev/null

    # Load probes and alignments from MOE430v2 into database.
    ssh hgwdev
    cd /hive/data/genomes/mm9/bed/affyMOE430v2
    mkdir /projects/compbio/data/microarray/affyMOE430v2
    cp -p run/affyMOE430v2.fa /projects/compbio/data/microarray/affyMOE430v2

    ln -s /projects/compbio/data/microarray/affyMOE430v2/affyMOE430v2.fa \
	/gbdb/hgFixed/affyProbes

    hgLoadPsl mm9 affyMOE430v2.psl
    hgLoadSeq mm9 /gbdb/hgFixed/affyProbes/affyMOE430v2.fa
    #	45037 sequences
    pslToBed affyMOE430v2.psl affyMOE430v2Probes.bed
    hgLoadBed -tmpDir=/scratch/tmp mm9 affyMOE430v2Probes affyMOE430v2Probes.bed
    #	Loaded 46193 elements of size 12
    # this is temporary, for use with bedMergeExpData below

    #	Create a similar formatted file to the one used in MOE430
    zcat geoData/GSE10246_series_matrix.txt.gz \
	| egrep "^\"1|source_name|Sample_title" \
	| sed -e "s/\!Sample_title/#Probe Set/; s#\!Sample_source_name_ch1##;" \
	| sed -e "s/\"//g" > gnfMOE430v2.AD.txt

    #	create gnfMouseAtlas3AllExps and gnfMouseAtlas3All tables in hgFixed
    hgGnfMicroarray gnfMouseAtlas3AllExps gnfMouseAtlas3All \
	gnfMOE430v2.AD.txt -chip=affyMOE430v2
    #	182 experiments
    #	from that table, create median ratio table
    # create table gnfMOE430v2AllRatio in hgFixed from hgFixed.gnfMOE430v2All
    #	and classification file ../hgMedianMicroarray/gnfMOE430v2.ra
    hgRatioMicroarray gnfMouseAtlas3All gnfMouseAtlas3AllRatio \
	-clump=$HOME/kent/src/hg/makeDb/hgMedianMicroarray/gnfMOE430v2.ra

    # add those ratios to the probe locations to make a bed 15 microarray type
    bedMergeExpData hgFixed.gnfMouseAtlas3AllRatio mm9.affyMOE430v2Probes \
	gnfMouseAtlas3AllRatio.bed
    #	no longer need this table
    #	do not need this table for the genome browser display
    hgsql -e "drop table affyMOE430v2Probes;" mm9

    hgLoadBed mm9 gnfMouseAtlas3 gnfMouseAtlas3AllRatio.bed

    hgMapToGene mm9 gnfMouseAtlas3 knownGene \
	knownToGnfMouseAtlas3 '-type=bed 12'

    time hgExpDistance mm9 hgFixed.gnfMouseAtlas3AllRatio \
	hgFixed.gnfMouseAtlas3AllExps gnfMouseAtlas3Distance \
	-lookup=knownToGnfMouseAtlas3
    #	Have 45036 elements in hgFixed.gnfMouseAtlas3AllRatio
    #	Got 39872 unique elements in hgFixed.gnfMouseAtlas3AllRatio

    #	Loaded gnfMouseAtlas3Distance
    #	real    34m56.844s
    #	user    58m1.892s
    #	sys     1m44.821s

    # Take the median value over multiple replicates creating
    # hgFixed.gnfMouseAtlas3MedianRatio and gnfMouseAtlas3MedianExps
    cd ../hgMedianMicroarray
    hgMedianMicroarray hgFixed gnfMouseAtlas3AllRatio gnfMouseAtlas3AllExps \
	$HOME/kent/src/hg/makeDb/hgMedianMicroarray/gnfMOE430v2.ra \
	gnfMouseAtlas3MedianRatio gnfMouseAtlas3MedianExps -minExps=1

    # Also make a median version of the absolute measurements
    hgMedianMicroarray hgFixed gnfMouseAtlas3All gnfMouseAtlas3AllExps \
	$HOME/kent/src/hg/makeDb/hgMedianMicroarray/gnfMOE430v2.ra \
	gnfMouseAtlas3AllMedian gnfMouseAtlas3AllMedianExps -minExps=1

    time hgExpDistance mm9 hgFixed.gnfMouseAtlas3MedianRatio \
	hgFixed.gnfMouseAtlas3MedianExps gnfMouseAtlas3MedianDistance \
	-lookup=knownToGnfMouseAtlas3
# Have 45037 elements in hgFixed.gnfMouseAtlas3MedianRatio
# Got 39872 unique elements in hgFixed.gnfMouseAtlas3MedianRatio
# XXX - working Mon Nov 24 10:01:43 PST 2008

    #	real    16m5.102s
    #	user    41m54.581s
    #	sys     1m28.595s


    #	182 experiments
    # Convert these to ratios using the median of medians of non-cancerous
    # cell types as the denominator as so:
    cd ~/src/hg/makeDb/hgRatioMicroarray
    cd ../hgMedianMicroarray
    # create tables gnfMOE430v2MedianRatio gnfMOE430v2MedianExps in hgFixed
    hgMedianMicroarray hgFixed gnfMOE430v2AllRatio gnfMOE430v2AllExps \
	gnfMOE430v2.ra gnfMOE430v2MedianRatio gnfMOE430v2MedianExps -minExps=1

    # Also make a median version of the absolute measurements
    #	create gnfMOE430v2Median
    hgMedianMicroarray hgFixed gnfMOE430v2All gnfMOE430v2AllExps \
	gnfMOE430v2.ra gnfMOE430v2Median gnfMOE430v2MedianExps -minExps=1

    cd /hive/data/genomes/mm9/bed/affyMOE430v2
    # Load up microarray track
    hgMapMicroarray gnfMOE430v2.bed hgFixed.gnfMOE430v2MedianRatio \
    	affyMOE430v2.psl
    #	Loaded 45037 rows of expression data from hgFixed.gnfMOE430v2MedianRatio
    #	Mapped 44106,  multiply-mapped 2087, missed 0, unmapped 931

    hgLoadBed mm9 gnfMOE430v2 gnfMOE430v2.bed
    #	Loaded 46193 elements of size 15

#######################################
    hgExpDistance mm9 hgFixed.gnfMouseAtlas2MedianRatio \
	hgFixed.gnfMouseAtlas2MedianExps gnfAtlas2Distance -lookup=knownToGnf1m
# Convert these to ratios using the median of medians of non-cancerous
# cell types as the denominator as so:
cd ~/src/hg/makeDb/hgRatioMicroarray
hgRatioMicroarray gnfMouseAtlas2All gnfMouseAtlas2AllRatio -clump=../hgMedianMicroarray/gnfMouseAtlas2.ra
# Take the median value over multiple replicates and put in this table:
cd ../hgMedianMicroarray
hgMedianMicroarray hgFixed gnfMouseAtlas2AllRatio gnfMouseAtlas2AllExps gnfMouseAtlas2.ra gnfMouseAtlas2MedianRatio gnfMouseAtlas2MedianExps -minExps=1
# Also make a median version of the absolute measurements
hgMedianMicroarray hgFixed gnfMouseAtlas2All gnfMouseAtlas2AllExps gnfMouseAtlas2.ra gnfMouseAtlas2Median gnfMouseAtlas2MedianExps -minExps=1

############################################################################
# hgPal downloads
    ssh hgwdev
    screen
    bash
    rm -rf /cluster/data/mm9/bed/multiz30way/pal
    mkdir /cluster/data/mm9/bed/multiz30way/pal
    cd /cluster/data/mm9/bed/multiz30way/pal
    cat > order.lst <<EOF
    mm9
    rn4
    cavPor2
    oryCun1
    hg18
    panTro2
    rheMac2
    ponAbe2
    calJac1
    otoGar1
    tupBel1
    sorAra1
    eriEur1
    canFam2
    felCat3
    equCab1
    bosTau3
    dasNov1
    loxAfr1
    echTel1
    monDom4
    ornAna1
    galGal3
    anoCar1
    xenTro2
    gasAcu1
    danRer5
    tetNig1
    fr2
    oryLat1
EOF

    mz=multiz30way
    gp=refGene
    db=mm9
    mkdir exonAA exonNuc ppredAA ppredNuc
    for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
    do
	echo "date"
	echo "mafGene -chrom=$j  $db $mz $gp order.lst stdout | \
	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
	echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
	echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
	echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
	    gzip -c > exonAA/$j.exonAA.fa.gz"
    done > $gp.jobs

    time sh -x $gp.jobs > $gp.jobs.log 2>&1 & 
    sleep 1
    tail -f $gp.jobs.log

# real    196m7.752s
# user    11m26.917s
# sys     3m41.587s

    zcat exonAA/*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
    zcat exonNuc/*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
    zcat ppredAA/*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
    zcat ppredNuc/*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz

    rm -rf exonAA exonNuc ppredAA ppredNuc

    # we're only distributing exons at the moment
    pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz

    mz=multiz30way
    gp=knownGene
    db=mm9

    mkdir exonAA exonNuc ppredAA ppredNuc
    for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
    do
	echo "date"
	echo "mafGene -chrom=$j  $db $mz $gp order.lst stdout | \
	    gzip -c > ppredAA/$j.ppredAA.fa.gz"
	echo "mafGene -chrom=$j -noTrans $db $mz $gp order.lst stdout | \
	    gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
	echo "mafGene -chrom=$j -exons -noTrans $db $mz $gp order.lst stdout | \
	    gzip -c > exonNuc/$j.exonNuc.fa.gz"
	echo "mafGene -chrom=$j -exons $db $mz $gp order.lst stdout | \
	    gzip -c > exonAA/$j.exonAA.fa.gz"
    done > $gp.$mz.jobs

    time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & 
    sleep 1
    tail -f $gp.$mz.job.log

# real    216m43.721s
# user    18m33.552s
# sys     5m42.639s

    zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
    zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
    zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
    zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz

    rm -rf exonAA exonNuc ppredAA ppredNuc

    pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz

    # now do the canonical set
    cd /cluster/data/mm9/bed/multiz30way/pal
    mz=multiz30way
    gp=knownCanonical
    db=mm9
    for j in `awk '{print $1}' /cluster/data/mm9/chrom.sizes`
    do
	echo "select chrom, chromStart, chromEnd, transcript from knownCanonical where chrom='$j'" | hgsql $db | tail -n +2 > $j.known.bed
    done

    mkdir exonAA exonNuc ppredAA ppredNuc
    for j in `sort -nk 2 /cluster/data/$db/chrom.sizes | awk '{print $1}'`
    do
	echo "date"
	echo "mafGene -geneBeds=$j.known.bed  $db $mz knownGene order.lst stdout | gzip -c > ppredAA/$j.ppredAA.fa.gz"
	echo "mafGene -geneBeds=$j.known.bed -noTrans $db $mz knownGene order.lst stdout | gzip -c > ppredNuc/$j.ppredNuc.fa.gz"
	echo "mafGene -geneBeds=$j.known.bed -exons -noTrans $db $mz knownGene order.lst stdout | gzip -c > exonNuc/$j.exonNuc.fa.gz"
	echo "mafGene -geneBeds=$j.known.bed -exons $db $mz knownGene order.lst stdout | gzip -c > exonAA/$j.exonAA.fa.gz"
    done > $gp.$mz.jobs

    time sh -x $gp.$mz.jobs > $gp.$mz.job.log 2>&1 & 
    sleep 1
    tail -f $gp.$mz.job.log

# real    192m17.168s
# user    10m28.659s
# sys     3m53.467s

    rm *.known.bed
    zcat exonAA/c*.gz | gzip -c > $gp.$mz.exonAA.fa.gz
    zcat exonNuc/c*.gz | gzip -c > $gp.$mz.exonNuc.fa.gz
    zcat ppredAA/c*.gz | gzip -c > $gp.$mz.ppredAA.fa.gz
    zcat ppredNuc/c*.gz | gzip -c > $gp.$mz.ppredNuc.fa.gz

    rm -rf exonAA exonNuc ppredAA ppredNuc

    db=mm9
    mz=multiz30way
    gp=knownCanonical
    pd=/usr/local/apache/htdocs/goldenPath/$db/$mz
    ln -s `pwd`/$gp.$mz.exonAA.fa.gz $pd/$gp.exonAA.fa.gz
    ln -s `pwd`/$gp.$mz.exonNuc.fa.gz $pd/$gp.exonNuc.fa.gz


#############################################################################
# MOUSE TISSUE EXON ARRAYS (Melissa Cline, cline@biology.ucsc.edu, 10/14/08)  
# (to build the affyExonTissues track, see the steps outlined in hg18.txt)
#############################################################################

########################################################################
## AFFY ALL EXON PROBESETS (HG18/MM9/RN4) (DONE 2009-01-29, Andy)
## (instructions are in hg18.txt)
########################################################################

################################################
# AUTOMATE UPSTREAM FILE CREATION (2008-10-15 markd)
update genbank.conf:
mm9.upstreamGeneTbl = refGene
mm9.upstreamMaf = multiz30way /hive/data/genomes/mm9/bed/multiz30way/species.list


#############################################################################
# MAKE PCR TARGET FOR UCSC GENES (DONE 11/4/08) (REDONE 2/24/11)
    ssh hgwdev
    mkdir /cluster/data/mm9/bed/mrnaPcr
    cd /cluster/data/mm9/bed/mrnaPcr
    genePredToBed /cluster/data/mm9/bed/ucsc.12/ucscGenes.gp > ucscGenes.bed
    hgsql mm9 -NBe 'select kgId,geneSymbol from kgXref' \
    | perl -wpe 's/^(\S+)\t(\S+)/$1\t${1}__$2/ || die;' \
      > idSub.txt
    subColumn 4 ucscGenes.bed idSub.txt ucscGenesIdSubbed.bed
    sequenceForBed -keepName -db=mm9 -bedIn=ucscGenesIdSubbed.bed \
      -fastaOut=stdout \
    | faToTwoBit stdin kgTargetSeq.2bit
    cut -f 1-10 /cluster/data/mm9/bed/ucsc.12/ucscGenes.gp \
    | genePredToFakePsl mm9 stdin kgTargetAli.psl /dev/null

    # Load up the UCSC Genes target PSL table and put 2bit in /gbdb:
    cd /cluster/data/mm9/bed/mrnaPcr
    hgLoadPsl mm9 kgTargetAli.psl
    mkdir /gbdb/mm9/targetDb
    ln -s /cluster/data/mm9/bed/mrnaPcr/kgTargetSeq.2bit /gbdb/mm9/targetDb/kgTargetSeq12.2bit

    # Ask cluster-admin to start an untranslated, -stepSize=5 gfServer on
    # /gbdb/mm9/targetDb/kgTargetSeq12.2bit .

    ssh hgwdev
    # Add records to hgcentraltest blatServers and targetDb.
    # The targetDb seqFile must name the 2bit that was actually linked into
    # /gbdb/mm9/targetDb above (kgTargetSeq12.2bit), which is also the file
    # the gfServer was requested for.
    hgsql hgcentraltest -e \
      'INSERT into blatServers values ("mm9Kg", "blat13", 17805, 0, 1);'
    hgsql hgcentraltest -e \
      'INSERT into targetDb values("mm9Kg", "UCSC Genes", \
         "mm9", "kgTargetAli", "", "", \
         "/gbdb/mm9/targetDb/kgTargetSeq12.2bit", 1, now(), "");'


#############################################################################
# TEST BLASTZ with Rn5 (DONE - 2008-11-26,30 - Hiram)
    mkdir /hive/data/genomes/mm9/bed/blastzRn5.2008-11-26
    cd /hive/data/genomes/mm9/bed/blastzRn5.2008-11-26

    cat << '_EOF_' > DEF
# mouse vs rat
# Specially tuned blastz parameters from Webb Miller

BLASTZ=blastz
BLASTZ_ABRIDGE_REPEATS=0
BLASTZ_O=600
BLASTZ_E=55
BLASTZ_Y=15000
BLASTZ_T=2
BLASTZ_K=4500
BLASTZ_Q=/scratch/data/blastz/mouse_rat.q

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rat Rn5
SEQ2_DIR=/scratch/data/rn5/rn5.2bit
SEQ2_LEN=/scratch/data/rn5/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/mm9/bed/blastzRn5.2008-11-26
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    cd /hive/data/genomes/mm9/bed/blastzRn5.2008-11-26
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
	-chainMinScore=5000 -chainLinearGap=medium \
	-stop=net `pwd`/DEF > do.log 2>&1 &
    #	real    403m22.371s
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
	-debug -chainMinScore=5000 -chainLinearGap=medium \
	-continue=load -stop=load `pwd`/DEF > load.log 2>&1 &
    #	real    44m59.528s
    cat fb.mm9.chainRn5BlastzLink.txt
    #	1751593467 bases of 2620346127 (66.846%) in intersection
    cat /cluster/data/mm9/bed/blastzRn4.2007-08-31/fb.mm9.chainRn4Link.txt
    #	1713186474 bases of 2620346127 (65.380%) in intersection

    mkdir /hive/data/genomes/rn5/bed/blastz.mm9.swap
    cd /hive/data/genomes/rn5/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm9/bed/blastzRn5.2008-11-26/DEF \
	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
	-chainMinScore=5000 -chainLinearGap=medium \
	-swap -stop=net > swap.log 2>&1 &
    #	real    63m51.690s
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm9/bed/blastzRn5.2008-11-26/DEF \
	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=pk \
	-chainMinScore=5000 -chainLinearGap=medium \
	-debug -swap -continue=load -stop=load > load.log 2>&1 &
    cat fb.rn5.chainMm9BlastzLink.txt
    #	1901280009 bases of 3372561689 (56.375%) in intersection

#############################################################################
# AFFY EXON PROBE LIFT MM8->MM9 (DONE, 2008-12-17 Andy)
    ssh hgwdev
    cd /hive/data/genomes/mm9/bed
    mkdir affyMoEx1
    cd affyMoEx1/
    # Dump the mm8 probe table as bed: drop the header row and the first
    # column (presumably the bin column -- confirm), then lift to mm9.
    # Probe failures go to unmapped.txt.
    echo "select * from affyMoEx1Probe" | \
       hgsql mm8 | tail -n +2 | cut -f2- > mm8.affyMoEx1Probe.bed
    liftOver mm8.affyMoEx1Probe.bed /gbdb/mm8/liftOver/mm8ToMm9.over.chain.gz \
       affyMoEx1Probe.bed unmapped.txt
    grep Partially unmapped.txt | wc -l
#199
    grep Split unmapped.txt | wc -l
#190
    grep Deleted unmapped.txt | wc -l
#354
    wc -l mm8.affyMoEx1Probe.bed
#4549897
    ## Out of 4.5 million probes in mm8, we've lost 743 in different ways
    ## attempting to lift.  That's an acceptable number.
    hgLoadBed mm9 affyMoEx1Probe{,.bed}
    # Same for the transcript table.  Its failures go to a separate file,
    # unmappedTranscripts.txt, so the probe results in unmapped.txt are not
    # overwritten and the transcript greps below have a file to read.
    echo "select * from affyMoEx1Transcript" | \
       hgsql mm8 | tail -n +2 | cut -f2- > mm8.affyMoEx1Transcript.bed
    liftOver mm8.affyMoEx1Transcript.bed /gbdb/mm8/liftOver/mm8ToMm9.over.chain.gz \
       affyMoEx1Transcript.bed unmappedTranscripts.txt
    hgLoadBed mm9 affyMoEx1Transcript{,.bed}
    ## Put unlifted IDs into a downloadable file.
    mkdir /usr/local/apache/htdocs/goldenPath/mm9/unlifted
    # Each grep -A1 keeps the matching reason line plus the record after it;
    # the second grep keeps only the bed record.
    grep -A1 Deleted unmapped.txt | grep "^chr" > affyMoEx1Probe.mm8Deleted.bed
    grep -A1 Partially unmapped.txt | grep "^chr" > affyMoEx1Probe.mm8PartiallyDeleted.bed
    grep -A1 Split unmapped.txt | grep "^chr" > affyMoEx1Probe.mm8Split.bed
    grep -A1 Deleted unmappedTranscripts.txt | grep "^chr" > affyMoEx1Transcript.mm8Deleted.bed
    grep -A1 Partially unmappedTranscripts.txt | grep "^chr" > affyMoEx1Transcript.mm8PartiallyDeleted.bed
    cp affyMoEx1*.mm8*.bed /usr/local/apache/htdocs/goldenPath/mm9/unlifted
    ## mm8 and mm9 track descriptions differ: 
    ## 1. Copy mouse/trackDb.ra setting to mouse/mm9/trackDb.ra and add
    ##    origAssembly mm8 line.
    ## 2. Make a new paragraph in a new affyMouseExon.html in mm9 to include
    ##    details about the lift and how many didn't lift.

#############################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2009-04-07)
    # bash  if not using bash shell already
    ssh kolossus
    mkdir /cluster/data/mm9/blastDb
    cd /cluster/data/mm9
    awk '{if ($2 > 1000000) print $1}' mm9Chroms_RandomContigs.hard.sizes > 1meg.lst
    twoBitToFa -seqList=1meg.lst  mm9Chroms_RandomContigs.hard.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    rm temp.fa 1meg.lst

    awk '{if ($2 <= 1000000) print $1}' mm9Chroms_RandomContigs.hard.sizes > less1meg.lst
    twoBitToFa -seqList=less1meg.lst  mm9Chroms_RandomContigs.hard.2bit temp.fa
    faSplit about temp.fa 1000000 blastDb/y 

    cd blastDb
    for i in *.fa
    do
	/hive/data/outside/blast229/formatdb -i $i -p F
    done
    rm *.fa
    ls *.nsq | wc -l
# 2712

    mkdir -p /cluster/data/mm9/bed/tblastn.hg18KG
    cd /cluster/data/mm9/bed/tblastn.hg18KG
    echo  ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst

# 2712 query.lst

   # we want around 250000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(250000/`wc query.lst | awk '{print $1}'`\)

# 36727/(250000/2712) = 398.414496

   mkdir -p kgfa
   split -l 398 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  kgfa/kg
   cd kgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
     done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   mkdir -p blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/mm9/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   # blastSome is the per-job wrapper named in the blastGsub template.
   # Arguments: $1 = blast database (-d), $2 = query protein fasta (-i),
   #            $3 = path of the psl file the job must produce.
   # It runs tblastn, retrying with each successively smaller e-value for as
   # long as blastall returns non-zero, converts the first successful report
   # to psl with blastToPsl, lifts the target side through blastDb.lft and
   # the query side (-pslQ) through protein.lft, and moves the result to $3
   # only when pslCheck -prot passes.  Scratch files live in /tmp under a
   # name built from the basenames of $3 and $2.
   # NOTE(review): when blastToPsl succeeds but pslCheck fails, the script
   # still reaches "exit 0" and leaves $f.1-$f.3 and $3.tmp behind;
   # presumably the "check out exists" clause in blastGsub is what flags
   # such jobs -- confirm before reusing this recipe.
   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/hive/data/outside/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/mm9/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
        if pslCheck -prot $3.tmp
        then                  
            mv $3.tmp $3     
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0               
    fi                      
fi                         
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec
    exit 
    
    ssh swarm
    cd /cluster/data/mm9/bed/tblastn.hg18KG
    para create blastSpec
#    para try, check, push, check etc.

    para time


# Completed: 252216 of 252216 jobs
# CPU time in finished jobs:   14882096s  248034.93m  4133.92h  172.25d  0.472 y
# IO & Wait Time:               1019014s   16983.57m   283.06h   11.79d  0.032 y
# Average job time:                  63s       1.05m     0.02h    0.00d
# Longest finished job:             184s       3.07m     0.05h    0.00d
# Submission to last job:         15667s     261.12m     4.35h    0.18d

    ssh swarm
    cd /cluster/data/mm9/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS ../blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    para create chainSpec
#    para try, check, push, check etc.

# Completed: 93 of 93 jobs
# CPU time in finished jobs:       5736s      95.59m     1.59h    0.07d  0.000 y
# IO & Wait Time:                 21289s     354.82m     5.91h    0.25d  0.001 y
# Average job time:                 291s       4.84m     0.08h    0.00d
# Longest finished job:             472s       7.87m     0.13h    0.01d
# Submission to last job:           496s       8.27m     0.14h    0.01d


    # Filter the chained alignments for each kgXX batch.  Column numbers
    # below follow the standard PSL layout ($1 matches, $11 qSize,
    # $12 qStart, $13 qEnd):
    #   c60.*  chains whose aligned query span (qEnd - qStart) is more than
    #          60% of the query size
    #   u.*    result of pslUniq over the c60 chains, fed in reverse
    #          numeric sort order
    #   m60.*  c60 chains whose match count is more than 60% of the
    #          query size
    # The u.* and m60.* sets are then merged, with duplicate lines dropped,
    # into ../unliftBlastHg18KG.psl.
    cd /cluster/data/mm9/bed/tblastn.hg18KG/blastOut
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort u.*.psl m60* | uniq > ../unliftBlastHg18KG.psl
    cd ..
    pslCheck unliftBlastHg18KG.psl
    liftUp -nohead temp.psl ../../jkStuff/mm9.contigs.lift carry unliftBlastHg18KG.psl 
    sort -T /tmp -k 14,14 -k 16,16n -k 17,17n temp.psl  > blastHg18KG.psl
    rm temp.psl
    pslCheck blastHg18KG.psl

    # load table 
    ssh hgwdev
    cd /cluster/data/mm9/bed/tblastn.hg18KG
    hgLoadPsl mm9 blastHg18KG.psl

    # check coverage
    featureBits mm9 blastHg18KG 
# 30285278 bases of 2620346127 (1.156%) in intersection

    featureBits mm9 knownGene:cds blastHg18KG  -enrichment
# knownGene:cds 1.278%, blastHg18KG 1.156%, both 0.969%, cover 75.86%, enrich  65.64x

    featureBits mm9 refGene:cds blastHg18KG  -enrichment
# refGene:cds 1.205%, blastHg18KG 1.156%, both 0.940%, cover 78.04%, enrich 67.52x

    rm -rf blastOut
#end tblastn

#############################################################################
# LASTZ Swap Human Hg19 (DONE - 2009-05-14 - Hiram)
    #	the original
    cd /hive/data/genomes/hg19/bed/lastzMm9.2009-05-13
    cat fb.hg19.chainMm9Link.txt 
    #	1022734273 bases of 2897316137 (35.299%) in intersection

    #	and the swap
    mkdir /hive/data/genomes/mm9/bed/blastz.hg19.swap
    cd /hive/data/genomes/mm9/bed/blastz.hg19.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzMm9.2009-05-13/DEF \
	-swap -noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    131m58.763s
    cat fb.mm9.chainHg19Link.txt 
    #	1013880568 bases of 2620346127 (38.693%) in intersection

#############################################################################
#  RE-BUILD miRNA TRACK (DONE, 2009-06-09-2009-06-11, hartera)
    # The miRNA track from miRBase is out of date so update the track. 
    mkdir -p /hive/data/genomes/mm9/bed/miRNA-2009-06-09
    cd /hive/data/genomes/mm9/bed/miRNA-2009-06-09
    # Download GFF file of latest miRNA annotations from miRBase at the
    # Wellcome Trust Sanger Institute (WTSI). This is Release 13.0.
    # (March 2009)
    wget --timestamping \
ftp://ftp.sanger.ac.uk/pub/mirbase/sequences/CURRENT/genomes/mmu.gff
    # Re-format, need to add "chr" to the beginning of each line.
    sed -e 's/^/chr/' mmu.gff > mmMirBaseFormat.gff
    # Remove extra "chr" in comment lines
    perl -pi.bak -e 's/chr#/#/' mmMirBaseFormat.gff
    # Change chrMT to chrM
    perl -pi.bak -e 's/chrMT/chrM/' mmMirBaseFormat.gff
    # Remove all but ID name in last field
    sed -e 's/\";//g' mmMirBaseFormat.gff | sed -e 's/ID=\"/transcript_id=/g' \
       | sed -e 's/ACC=\"MI[0-9]*\s//' > mmMirBaseFormatIdOnly.gff

    # Load into database. 
    ldHgGene -exon=miRNA mm9 miRNARel13 mmMirBaseFormatIdOnly.gff
    # Does not load as mmu-mir-692-2 is on two chroms, chr4 and chr13.
    # These are alignments not genePreds so convert to BED for loading into
    # the database.
    sed -e 's/\";//g' mmMirBaseFormat.gff | sed -e 's/ID=\"//g' \
       | sed -e 's/ACC=\"MI[0-9]*\s//' > mmMirBaseFormatIdOnly.gff
    # chr1    .       miRNA   20669091        20669163        .       +
    # .       mmu-mir-206
    # use score 960 for + strand and 480 for - strand. This will show
    # up black on the track for + strand and grey for - strand.
    # Re-do below and re-load track as appears off by 1 compared to 
    # Ensembl track and other miRNA resources (2009-06-11)
    # Confirmed with Sam Griffiths-Jones that the coordinates in the
    # GFF file are 1-based. (2009-06-12).
    awk 'BEGIN {FS="\t"} {OFS="\t"} \
        {if ($0 !~ /#/ && $7 == "+") print $1, $4-1, $5, $9, 960, $7; \
       else if ($0 !~ /#/ && $7 == "-") print $1, $4-1, $5, $9, 480, $7;}' \
        mmMirBaseFormatIdOnly.gff > mmMirBaseFormatIdOnly.bed
    # Remove previous table
    hgsql -e 'drop table miRNA' mm9
    hgLoadBed mm9 miRNA mmMirBaseFormatIdOnly.bed
# Reading mmMirBaseFormatIdOnly.bed
# Loaded 568 elements of size 6
# Sorted
# Creating table definition for miRNARel13
# Saving bed.tab
# Loading mm9
    hgsql -e 'select count(*) from miRNA;' mm9 
# 568
# The previous version had 493 miRNAs.
hgsql -e 'select count(distinct name) from miRNA;' mm9
# 541
# The previous version had 466 unique miRNAs. 

############################################################################
# Re-Run equCab2 alignment (DONE - 2009-06-29,07-02 - Hiram)
    mkdir /hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29
    cd /hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29

    cat << '_EOF_' > DEF
# Mouse vs. Horse

BLASTZ_M=50

# TARGET: Mouse MM9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Horse
SEQ2_DIR=/scratch/data/equCab2/equCab2.2bit
SEQ2_LEN=/scratch/data/equCab2/chrom.sizes
SEQ2_CTGDIR=/hive/data/genomes/equCab2/equCab2.UnScaffolds.2bit
SEQ2_CTGLEN=/hive/data/genomes/equCab2/equCab2.UnScaffolds.sizes
SEQ2_LIFT=/hive/data/genomes/equCab2/jkStuff/equCab2.chrUn.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=100
SEQ2_LAP=0

BASE=/hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl `pwd`/DEF \
	-noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #	real    360m10.094s
    time doBlastzChainNet.pl `pwd`/DEF \
	-continue=chainMerge -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > chainMerge.log 2>&1 &
    #	real    225m4.178s
    cat fb.mm9.chainEquCab2Link.txt 
    #	912421053 bases of 2620346127 (34.821%) in intersection

    mkdir /hive/data/genomes/equCab2/bed/blastz.mm9.swap
    cd /hive/data/genomes/equCab2/bed/blastz.mm9.swap
    time doBlastzChainNet.pl \
	/hive/data/genomes/mm9/bed/lastzEquCab2.2009-06-29/DEF \
	-swap -noLoadChainSplit -verbose=2 -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real  122m25.314s
    cat fb.equCab2.chainMm9Link.txt 
    #	902295813 bases of 2428790173 (37.150%) in intersection

############################################################################
############################################################################
# TRANSMAP vertebrate.2009-07-01 build  (2009-07-21 markd)

Vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-07-01

see doc/builds.txt for specific details.
############################################################################
# VEGA GENES UPDATE TO BUILD 35 (DONE, 2009-07-30 - 2009-09-09, hartera)
# Needs updating as the current version is build 31 from May 2008.
# 2009-08-03 (hartera) - Added code to register track handler for
# vegaGeneComposite.
# 2009-08-15 - 2009-08-16 (hartera) - Added code to allow use of radio buttons
# on the configuration page for the track item labels. Modified code so it
# can be shared with Ensembl to create the links to Vega transcript, gene
# and protein reports on the details pages. 
# 2009-08-22 - Finished code for adding Vega report URLs to the details pages.
# Loaded the vegaGtp table.
# 2009-09-01 and 2009-09-03 (hartera). Loaded a vegaPep table for the protein
# sequence link on the details pages.
# 2009-09-04 Re-load all tables as some reverted to the older version during
# mySQL 5 upgrade.
# 2009-09-08 - 2009-09-09 Code change to change message on details page when 
# no protein is available and change to trackDb to make vegaGene items a 
# darker blue colour. Reloaded vegaPep after removing proteins whose
# transcripts are not in vegaGtp to make all.joiner happy.

   mkdir -p /hive/data/genomes/mm9/bed/vega35
   cd /hive/data/genomes/mm9/bed/vega35
   # Download the VEGA genes for mouse from the ftp site
   # This file is from 03/17/09.
   wget --timestamping \
        "ftp://ftp.sanger.ac.uk/pub/vega/mouse/gtf_file.gz"
   # add chr in front of chromosome names and lift up the randoms
   #    processing similar to the same processing for Ensembl genes,
   #    from /cluster/data/mm9/bed/ensGene.49/process/doProcess.csh
   cp -p /cluster/data/mm9/bed/ensGene.49/process/randoms.mm9.lift .
   zcat gtf_file.gz \
        | sed -e "s/^\([0-9XY][0-9]*\)/chr\1/; s/^MT/chrM/" \
        | liftUp -type=.gtf stdout randoms.mm9.lift carry stdin \
        | gzip > allGenes.gtf.gz
   # Got 189 lifts in randoms.mm9.lift

   gtfToGenePred -infoOut=infoOut.txt -genePredExt allGenes.gtf.gz stdout \
	| gzip > mm9.allGenes.gp.gz
   /cluster/home/hartera/kent/src/hg/utils/automation/extractGtf.pl \
	infoOut.txt > ensGtp.tab
   genePredCheck -db=mm9 mm9.allGenes.gp.gz
   # checked: 59381 failed: 0
   zcat allGenes.gtf.gz | grep -i pseudo > pseudo.gtf
   zcat allGenes.gtf.gz | grep -v -i pseudo > not.pseudo.gtf
   
   # Modify the GTF files so that the gene name goes into the 
   # name2 field of the genePred. 
   perl -pi.bak -e 's/gene_id/other_gene_id/' *pseudo.gtf
   perl -pi.bak -e 's/gene_name/gene_id/' *pseudo.gtf
   gtfToGenePred -genePredExt pseudo.gtf pseudo.gp
   gtfToGenePred -genePredExt not.pseudo.gtf not.pseudo.gp

   genePredCheck -db=mm9 pseudo.gp
    # checked: 4305 failed: 0
   genePredCheck -db=mm9 not.pseudo.gp
    # checked: 55076 failed: 0

   hgLoadGenePred -genePredExt mm9 vegaGene not.pseudo.gp
   hgLoadGenePred -genePredExt mm9 vegaPseudoGene pseudo.gp

   # clean up
   rm *.bak

   # 2009-08-03 (hartera)
   # Added code to src/hg/hgTracks/simpleTracks.c to register a track
   # handler for vegaGeneComposite that is now used for this data. This used
   # vegaGeneMethods to display the name2 field (gene) as the item label in
   # the track.

   # 2009-08-15 - 2009-08-16 (hartera)
   # The information extracted from the attributes in the GTF file was
   # saved as ensGtp.tab, so rename it to vegaGtp.tab.
   mv ensGtp.tab vegaGtp.tab
   # ensGtp table definition is in ~/kent/src/hg/lib/ensGtp.sql
   # There is an index on the protein field so it can not be NULL. 
   # If there is no protein, the gene name is given.
   # Added code to hgTracks.c and hgTrackUi.c to allow the use of 
   # radio buttons on the track configuration page to select the
   # gene name, accession or both to be displayed in the track.
   # The gene name is displayed by default.
   # Added code to hgc.c so that Ensembl and Vega can share code to 
   # create links on the details pages to the Vega reports for transcript, 
   # gene and protein through these IDs. Created new function
   # printEnsemblOrVegaCustomUrl(). 

   # 2009-08-22 (hartera)
   # Loaded the vegaGtp table. Use ensGtp.sql to create the table.
   # vegaGtp associates geneId/transcriptId/proteinId 
   # for the links to Vega reports from the details page.
   cd /hive/data/genomes/mm9/bed/vega35
   cp ~/kent/src/hg/lib/ensGtp.sql .
   # 11 of the gene names for noncoding transcripts are too long for the 
   # protein ID field so change this field in ensGtp.sql to allow 40 chars 
   # instead of 20 and re-load the table.
   hgsql -e 'drop table vegaGtp;' mm9
   hgLoadSqlTab mm9 vegaGtp ensGtp.sql vegaGtp.tab
   # Loaded successfully
   # Added code to hgc.c to use printEnsemblOrVegaCustomUrl() in 
   # doVegaGene() to add the links to Vega reports on the details pages.
   # Code was added so that there is no protein sequence link on the details
   # page if there is none available, e.g. noncoding.
   # 2009-09-01 (hartera)
   # Coding genes are displaying the message that there is no protein
   # prediction available. Need to add a vegaPep table.
   cd /hive/data/genomes/mm9/bed/vega35
   # Download the protein FASTA file for Vega35
   wget --timestamping "ftp://ftp.sanger.ac.uk/pub/vega/mouse/pep/*.tot.fa.gz"
   # from the Ensembl process:
   zcat Mus_musculus.VEGA.mar.pep.tot.fa.gz  \
       | sed -e 's/^>.* Transcript:/>/;' | gzip > vegaPep.txt.gz
   zcat vegaPep.txt.gz \
       | ~/kent/src/utils/faToTab/faToTab.pl /dev/null /dev/stdin \
       | sed -e '/^$/d; s/*$//' | sort > vegaPep.mm9.fa.tab
   # Load table (2009-09-03, hartera)
   hgPepPred mm9 tab vegaPep vegaPep.mm9.fa.tab
   # Add vegaPep to the trackDb.ra entry for the vegaGeneComposite track 
   # in the type line for src/hg/makeDb/trackDb/mouse/mm9/trackDb.ra.
   # Check that the vegaPep table looks ok and then check protein-coding and 
   # noncoding transcript details pages for protein links.
   
   # 2009-09-04, hartera
   # Re-load tables after upgrade to mySQL 5 as they had reverted back to 
   # tables with the previous Vega dataset.

   cd /hive/data/genomes/mm9/bed/vega35
   hgsql -e 'drop table vegaGene;' mm9
   hgsql -e 'drop table vegaPseudoGene;' mm9
   hgLoadGenePred -genePredExt mm9 vegaGene not.pseudo.gp
   hgLoadGenePred -genePredExt mm9 vegaPseudoGene pseudo.gp
   hgsql -e 'drop table vegaGtp;' mm9
   hgLoadSqlTab mm9 vegaGtp ensGtp.sql vegaGtp.tab
   hgsql -e 'drop table vegaPep;' mm9
   hgPepPred mm9 tab vegaPep vegaPep.mm9.fa.tab
   # 2009-09-08 (hartera). Changed message in code for details page when no
   # protein sequence is available to be more explanatory. "Non-protein
   # coding gene or gene fragment, no protein prediction available." Changed
   # the colouring for the vegaGene subtrack to be darker blue so there is 
   # more of a contrast between vegaGene and vegaPseudoGene subtracks.

   # 2009-09-09 (hartera) - re-loaded vegaPep table with only those proteins
   # that have a transcript ID in vegaGtp. 
   # all.joiner is complaining as there are about 1,000 extra proteins in 
   # vegaPep that do not have transcripts in vegaGtp. Decided to remove these
   # and e-mailed the HAVANA group to ask about the discrepancy. 
   cd /hive/data/genomes/mm9/bed/vega35
   awk '{print $2}' vegaGtp.tab | sort | uniq > vegaGtp.tx.ids
   awk '{print $1}' vegaPep.mm9.fa.tab | sort | uniq > vegaPep.tx.ids
   wc -l *.tx.ids
   # 59381 vegaGtp.tx.ids
   # 30956 vegaPep.tx.ids
   
   # Number of transcripts that have a protein ID:
   hgsql -Ne 'select transcript from vegaGtp where protein like "OTTMUSP%";' \
        mm9 | sort | uniq > vegaGtpWithProt.tx.ids
   wc -l vegaGtpWithProt.tx.ids        
   # 29902 vegaGtpWithProt.tx.ids
 
   # find those that are common to both. 
   comm -12 vegaGtp.tx.ids vegaPep.tx.ids > pepandGtp.tx.ids
   wc -l pepandGtp.tx.ids 
   # 29902 pepandGtp.tx.ids
   comm -12 pepandGtp.tx.ids vegaGtpWithProt.tx.ids | wc -l 
   # 29902
   # Therefore all the vegaGtp transcripts with a protein ID are in the
   # protein FASTA file.
   # Dump the vegaPep rows whose name equals the transcript of a vegaGtp row
   # with an OTTMUSP protein ID; the result is reloaded as vegaPep below.
   # NOTE(review): "select *" on this two-table join emits the vegaGtp
   # columns after the vegaPep columns on every line.  Confirm that the
   # "hgPepPred ... tab" loader tolerates the extra fields, or select only
   # p.* next time.
   hgsql -Ne 'select * from vegaPep as p, vegaGtp as g where g.protein \
         like "OTTMUSP%" and p.name = g.transcript;' mm9 \
         > vegaPepOnlyInGtp.mm9.fa.tab
   wc -l vegaPepOnlyInGtp.mm9.fa.tab 
   # 29902 vegaPepOnlyInGtp.mm9.fa.tab  
   hgsql -e 'drop table vegaPep;' mm9
   hgPepPred mm9 tab vegaPep vegaPepOnlyInGtp.mm9.fa.tab
    
############################################################################
# Blastz Elephant loxAfr3 (DONE - 2009-08-12 - Hiram)
    mkdir /hive/data/genomes/mm9/bed/lastzLoxAfr3.2009-08-12
    cd /hive/data/genomes/mm9/bed/lastzLoxAfr3.2009-08-12

    cat << '_EOF_' > DEF
# Mouse vs. Elephant
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Elephant loxAfr3
SEQ2_DIR=/scratch/data/loxAfr3/loxAfr3.2bit
SEQ2_LEN=/scratch/data/loxAfr3/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/cluster/data/mm9/bed/lastzLoxAfr3.2009-08-12
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real 498m44.261s
    cat fb.mm9.chainLoxAfr3Link.txt
    #	684326090 bases of 2620346127 (26.116%) in intersection

    #	trying syntenic nets
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-continue=syntenicNet -syntenicNet > syntenicNet.log 2>&1 &
    #	about 20 minutes

    mkdir /hive/data/genomes/loxAfr3/bed/blastz.mm9.swap
    cd /hive/data/genomes/loxAfr3/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm9/bed/lastzLoxAfr3.2009-08-12/DEF \
	-swap -noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-syntenicNet > swap.log 2>&1 &
    #	real    123m9.342s
    cat fb.loxAfr3.chainMm9Link.txt 
    #	673856452 bases of 3118565340 (21.608%) in intersection

#########################################################################
## NIA Mouse Gene Index - (DONE, Fan, 9/9/09)
# NOTE FOR NEXT TIME: this track fails pslCheck because every row in the
# NIAGene table has a tSize of 198000000.  Future tables should contain the
# proper chromosome lengths in the tSize field.  (Brooke, 2/22/10)
    ssh hgwdev 
    mkdir -p /cluster/data/mm9/bed/NIAGene090903
    cd /cluster/data/mm9/bed
    ln -s NIAGene090903 NIAGene
    cd NIAGene
    # Fetch the NIA Gene Index fasta file and its PSL alignments, then
    # unpack both in place.  The fasta file must unpack to T-fasta.fa,
    # because that is the name symlinked into /gbdb and given to hgLoadSeq
    # below (the URL was previously recorded as T-fasta.ff.gz, which would
    # have produced T-fasta.ff and a dangling symlink).
    wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex/mm9/download/T-fasta.fa.gz
    wget --timestamp http://lgsun.grc.nia.nih.gov/geneindex/mm9/download/T-psl.txt.gz
    gzip -d *.gz
    
    cut -f 1-21 T-psl.txt >NIAGene.tab
    hgLoadPsl mm9 NIAGene.tab

    mkdir /gbdb/mm9/NIAGene
    ln -s /cluster/data/mm9/bed/NIAGene/T-fasta.fa /gbdb/mm9/NIAGene/T-fasta.fa
    
    hgLoadSeq mm9 /gbdb/mm9/NIAGene/T-fasta.fa

#Creating seq.tab file
#Adding /gbdb/mm9/NIAGene/T-fasta.fa
#257758 sequences
#Updating seq table
#Warning: load of seq did not go as planned: 257758 record(s), 0 row(s) skipped, 257758 warning(s) loading ./seq.tab
#Advisory lock has been released
#All done

# not sure what the warnings are about, but the track seems to be working.

# Create/edit/check in NIAGene.html and trackDb.ra under
    
        kent/src/hg/makeDb/trackDb/mouse/mm9

#####################################################################
# LASTZ Tetraodon TetNig2 (DONE - 2009-09-15 - Hiram)
    mkdir /hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15
    cd /hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15

    cat << '_EOF_' > DEF
# mouse vs tetraodon
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Tetraodon TetNig2 - single chunk big enough to single largest item
SEQ2_DIR=/scratch/data/tetNig2/tetNig2.2bit
SEQ2_LEN=/scratch/data/tetNig2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/tetNig2/tetNig2.contigs.2bit
SEQ2_CTGLEN=/scratch/data/tetNig2/tetNig2.contigs.sizes
SEQ2_LIFT=/scratch/data/tetNig2/tetNig2.contigs.lift
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=50

BASE=/hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	about 124 minutes
    cat fb.mm9.chainTetNig2Link.txt 
    #	45642112 bases of 2620346127 (1.742%) in intersection

    #	running the swap
    mkdir /hive/data/genomes/tetNig2/bed/blastz.mm9.swap
    cd /hive/data/genomes/tetNig2/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm9/bed/lastzTetNig2.2009-09-15/DEF \
	-qRepeats=windowmaskerSdust \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    10m34.797s
    cat fb.tetNig2.chainMm9Link.txt 
    #	41176381 bases of 302314788 (13.620%) in intersection

##############################################################################
# BUILD REST TRACK (DONE 9/16/09, Fan)

    mkdir /hive/data/genomes/mm9/bed/REST
    cd /hive/data/genomes/mm9/bed/REST

# Receive bed data file, REST_ChIP_PET_mm9.bed, 
# from Rory JOHNSON [johnsonrb@gis.a-star.edu.sg].

    hgLoadBed mm9 REST REST_ChIP_PET_mm9.bed

# Discovered mm9's extFile and history tables were out of sync.
# Bob and Hiram fixed the problem.  Reload and it was successful.

# Created REST.html based on Rory's original doc and later updates.
# Added track definition and search term into trackDb/mouse/mm9/trackDb.ra

# Fix the 0 base problem. (Fan 9/20/09, per Rory's email)

    # Shift chromStart down by one in every row of the table (the "0 base
    # problem" fix noted above).
    # NOTE(review): the table was loaded above with hgLoadBed as "REST" but
    # is updated here as "rest".  MySQL table names are usually
    # case-sensitive on Linux, so confirm which spelling the table really
    # has before re-running this.
    hgsql mm9 -e 'update rest set chromStart = chromStart -1'

############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

Vertebrate-wide transMap alignments were built.  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.
############################################################################
# ADD LINK TO GENENETWORK (DONE. 11/06/09 Fan).

# Received geneNetwork ID list file, GN_mouse_RefSeq.txt, for mm9 from
# GeneNetwork, Zhou Xiaodong [xiaodong.zhou@gmail.com].

    ssh hgwdev
    mkdir -p /cluster/data/mm9/bed/geneNetwork
    cd /cluster/data/mm9/bed/geneNetwork

    hgsql mm9 < ~/src/hg/lib/geneNetworkId.sql
    hgsql mm9 -e \
    'load data local infile "GN_mouse_RefSeq.txt" into table geneNetworkId'

#########################################################################
# LASTZ/CHAIN/NET swap danRer6 (DONE - 2009-12-18 - Galt)
    # original alignment to danRer6
    cd /hive/data/genomes/danRer6/bed/lastzMm9.2009-12-17
    cat fb.danRer6.chainMm9Link.txt 
    #   77099032 bases of 1506896106 (5.116%) in intersection

    #	running the swap - DONE - 2009-12-18
    mkdir /hive/data/genomes/mm9/bed/blastz.danRer6.swap
    cd /hive/data/genomes/mm9/bed/blastz.danRer6.swap
    time nice +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/danRer6/bed/lastzMm9.2009-12-17/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-swap >& swap.log &
    #	real    183m21.102s
    cat fb.mm9.chainDanRer6Link.txt 
    #   73444297 bases of 2620346127 (2.803%) in intersection


#######################################################################
# Vega gene update (DONE - 2010-01-15 - Hiram)
    #	lookup version number at the Vega WEB site:
    #	http://vega.sanger.ac.uk/index.html
    #	and FTP site:
    #	ftp://ftp.sanger.ac.uk/pub/vega/
    cd /hive/data/genomes/mm9
    #	step wise to verify operation
    doEnsGeneUpdate.pl -vegaGene -ensVersion=37 -stop=download mm9.ensGene.ra
    doEnsGeneUpdate.pl -vegaGene -ensVersion=37 \
	-continue=process -stop=process mm9.ensGene.ra
    doEnsGeneUpdate.pl -vegaGene -ensVersion=37 \
	-continue=load -stop=load mm9.ensGene.ra
    doEnsGeneUpdate.pl -vegaGene -ensVersion=37 \
	-continue=cleanup mm9.ensGene.ra
    featureBits mm9 vegaGene
    # 53838752 bases of 2620346127 (2.055%) in intersection
    featureBits mm9 vegaPseudoGene
    # 3060300 bases of 2620346127 (0.117%) in intersection

######################################################################## 
# Blastz Rabbit oryCun2 (DONE - 2010-01-15 - Hiram)
    ssh hgwdev
    screen # use screen to control this job
    mkdir /hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15
    cd /hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15

    cat << '_EOF_' > DEF
# Mouse vs. Rabbit
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/cluster/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rabbit at chunk 20,000,000 all but 36 contigs can fit in a single job
SEQ2_DIR=/scratch/data/oryCun2/oryCun2.2bit
SEQ2_LEN=/scratch/data/oryCun2/chrom.sizes
SEQ2_CTGDIR=/scratch/data/oryCun2/oryCun2.contigs.2bit
SEQ2_CTGLEN=/scratch/data/oryCun2/oryCun2.contigs.sizes
SEQ2_LIFT=/hive/data/genomes/oryCun2/contigs/oryCun2.contigs.lift
SEQ2_CHUNK=20000000
SEQ2_LIMIT=400
SEQ2_LAP=0

BASE=/hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    cat fb.mm9.chainOryCun2Link.txt
# 670229789 bases of 2620346127 (25.578%) in intersection

    #	496428446 bases of 2620346127 (18.945%) in intersection
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-continue=syntenicNet -bigClusterHub=swarm \
	-syntenicNet > syntenicNet.log 2>&1 &
    #	about 20 minutes

    #	create reciprocal best chains/nets
    ssh hgwdev
    cd /hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15
    #	this needs blastz.oryCun2 symlink to function
    time nice -n +19 doRecipBest.pl mm9 oryCun2 > rbest.log 2>&1 &
    #	real    37m32.151s

    mkdir /hive/data/genomes/oryCun2/bed/blastz.mm9.swap
    cd /hive/data/genomes/oryCun2/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-swap -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> swap.log 2>&1 &
    #	real    84m6.571s
    cat fb.oryCun2.chainMm9Link.txt 
    #	669602734 bases of 2604023284 (25.714%) in intersection

#########################################################################
# ailMel1 Panda alignment (DONE - 2010-02-04 - Hiram)
    mkdir /hive/data/genomes/mm9/bed/lastzAilMel1.2010-02-04
    cd /hive/data/genomes/mm9/bed/lastzAilMel1.2010-02-04

    cat << '_EOF_' > DEF
# Mouse vs. Panda
#	parameters from the Panda paper supplemental where they describe
#	their lastz parameters
BLASTZ_K=2200
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_H=2000
BLASTZ_C=2
BLASTZ_T=2

# our usual M
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Panda
SEQ2_DIR=/scratch/data/ailMel1/ailMel1.2bit
SEQ2_LEN=/scratch/data/ailMel1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/hive/data/genomes/mm9/bed/lastzAilMel1.2010-02-04
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #	real    501m27.760s
    cat fb.mm9.chainAilMel1Link.txt 
    #	749595031 bases of 2620346127 (28.607%) in intersection

    mkdir /hive/data/genomes/ailMel1/bed/blastz.mm9.swap
    cd /hive/data/genomes/ailMel1/bed/blastz.mm9.swap
    time doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm9/bed/lastzAilMel1.2010-02-04/DEF \
	-swap -noLoadChainSplit -bigClusterHub=swarm -smallClusterHub=memk \
	-workhorse=hgwdev \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    54m57.140s
    cat fb.ailMel1.chainMm9Link.txt 
    #	739076250 bases of 2245312831 (32.916%) in intersection

############################################################################
# susScr1 Pig BLASTZ/CHAIN/NET (DONE - 2010-01-21,22 - Hiram)
    screen # use a screen to manage this multi-day job
    mkdir /hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21
    cd /hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21

    cat << '_EOF_' > DEF
# Pig vs. Mouse
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Pig SusScr1
SEQ2_DIR=/scratch/data/susScr1/susScr1.2bit
SEQ2_LEN=/scratch/data/susScr1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #	real    875m26.114s
    cat fb.mm9.chainSusScr1Link.txt 
    #	616833828 bases of 2620346127 (23.540%) in intersection

    mkdir /hive/data/genomes/susScr1/bed/blastz.mm9.swap
    cd /hive/data/genomes/susScr1/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm9/bed/lastzSusScr1.2010-01-21/DEF \
	-swap -noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=pk \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    69m27.221s
    cat fb.susScr1.chainMm9Link.txt 
    #	656445475 bases of 2231332019 (29.419%) in intersection

#########################################################################
# CRG MAPABILITY (2010-02-05 - 2010-02-09, hartera, DONE)
# Data was provided by Thomas Derrien (thomas.derrien.crg.es) and Paolo Ribeca
# from the Guigo lab at the Center for Genomic Regulation (CRG) in Barcelona
# on 2010-02-04.
# Data was produced using their GEM mapper aligner, taking sliding k-mer
# windows of the mouse genome that were mapped back onto the genome with up
# to 2 mismatches. For each window, a mappability score is computed as
# S = 1/(number of matches found) and the BigWig index was created according
# to this score.
# 2010-02-09. Loaded database and added data to /gbdb/
# Added trackDb entry for the Mapability track.
# 2010-04-02. Replaced the Mapability 40mer subtrack bigWig file with a new one
# provided by CRG as the old file had regions with missing data.
# 2010-04-28. Received new data from Thomas Derrien. Downloaded data and 
# added it to /gbdb/. A bug was found in a library used by bedGraphToBigWig so
# sent a new binary to data providers and they re-created the bigWig files. 
     mkdir -p /hive/data/genomes/mm9/bed/crgMapability
     cd /hive/data/genomes/mm9/bed/crgMapability
cat << 'EOF' > temp
#!/bin/tcsh -ef
http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-36_mm9.bw.bz2
http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-50_mm9.bw.bz2
http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-75_mm9.bw.bz2
http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-100_mm9.bw.bz2
http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-40_mm9.bw.bz2
'EOF'

     awk '{if ($0 ~ /#/) print; else print "wget --timestamping \"" $0 "\"";}' \
         temp > download.csh
     rm temp
     chmod +x download.csh
     ./download.csh >& download.log &
     
     # Add the data to /gbdb/ and load the file names into tables (2010-01-26)
     cd /hive/data/genomes/mm9/bed/crgMapability
     bunzip2 *.bz2
     # Add data to gbdb
     mkdir -p /gbdb/mm9/bbi/
     # Symlink files with names as crgMapabilityAlignXmer.bw to /gbdb/mm9/bbi
     # and load file name into a table - one per dataset. Each table 
     # represents a subtrack.
     # For each bigWig file, take the k-mer size <K> from the file name
     # (text between the "-" and the following "_", e.g.
     # M.musculus.genome.mm9.mappability-36_mm9.bw -> 36), symlink the file
     # into /gbdb/mm9/bbi as crgMapabilityAlign<K>mer.bw and (re)create a
     # one-row table of the same name holding that /gbdb path.
     # The glob is used directly instead of parsing `ls` output.
     foreach f (*.bw)
        echo $f
        set num = `echo $f | cut -d "-" -f2 | cut -d "_" -f1`
        set mer = "${num}mer"
        set nf = "crgMapabilityAlign${mer}.bw"
        echo $nf
        ln -s `pwd`/${f} /gbdb/mm9/bbi/${nf}
        hgsql mm9 -e "drop table if exists crgMapabilityAlign${mer}; \
     create table crgMapabilityAlign${mer} (fileName varchar(255) not null); \
     insert into crgMapabilityAlign${mer} values ('/gbdb/mm9/bbi/${nf}');"
     end

     # Added a trackDb entry for this mapability track in
     # kent/src/hg/makeDb/trackDb/mouse/mm9/trackDb.ra
     # use bigWigInfo to check min and max values. Created a mapability.html
     # description page.
     
     # 2010-04-02, hartera
     # QA found regions of missing data for the 40mer subtrack. Wrote to the 
     # data providers and they said that the original output has no missing
     # data so they recreated the bigWig file for the 40mer subtrack and a 
     # link to the new file was sent on 2010-04-02. 
     cd /hive/data/genomes/mm9/bed/crgMapability
     wget --timestamping \
"http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-40_mm9.bw.gz"
     gunzip M.musculus.genome.mm9.mappability-40_mm9.bw.gz
     # Remove old file from /gbdb/mm9/bbi and add new file.
     rm /gbdb/mm9/bbi/crgMapabilityAlign40mer.bw
     ln -s `pwd`/M.musculus.genome.mm9.mappability-40_mm9.bw \
         /gbdb/mm9/bbi/crgMapabilityAlign40mer.bw

     # Downloaded and added new bigWig files to /gbdb/mm9/bbi
     # (2010-04-30 and 2010-05-01, hartera). New files were created as
     # there was a bug in the older version of bedGraphToBigWig.
     cd /hive/data/genomes/mm9/bed/crgMapability
     # Clear out the helper files from the earlier download; -f because
     # temp was already removed right after download.csh was generated.
     rm -f temp download.csh download.log
cat << 'EOF' > temp
#!/bin/tcsh -ef
http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-100_mm9.bz2
http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-36_mm9.bz2
http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-40_mm9.bz2
http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-50_mm9.bz2
http://genome.crg.es/~tderrien/UCSC_Tracks/M.musculus.genome.mm9.mappability-75_mm9.bz2
'EOF'
     # Turn the URL list into a script: lines containing "#" (the shebang)
     # pass through unchanged, every other line becomes a wget command.
     awk '{if ($0 ~ /#/) print; else print "wget --timestamping \"" $0 "\"";}' \
         temp > download.csh
     rm temp
     chmod +x download.csh
     ./download.csh >& download.log &

     # Add data to /gbdb/. The file names in /gbdb/ are the same as before 
     # so the tables do not need to be reloaded.
     cd /hive/data/genomes/mm9/bed/crgMapability
     bunzip2 *.bz2
     # The uncompressed files have no .bw extension (their names end in
     # "mm9"), so add one. The glob is used directly instead of parsing `ls`.
     foreach f (*mm9)
        echo $f
        set g = ${f}.bw
        echo $g
        mv $f $g
     end
     # Then symlink to /gbdb/ under the crgMapabilityAlign<K>mer.bw names,
     # where <K> is the text between the "-" and the following "_" in the
     # file name. Any existing link is removed first; -f so that a missing
     # link is not an error.
     foreach f (*.bw)
        echo $f
        set num = `echo $f | cut -d "-" -f2 | cut -d "_" -f1`
        set mer = "${num}mer"
        set nf = "crgMapabilityAlign${mer}.bw"
        echo $nf
        rm -f /gbdb/mm9/bbi/${nf}
        ln -s `pwd`/${f} /gbdb/mm9/bbi/${nf}
     end

#####################################################################
# tRNAs track (2010-03-12, Fan RE-BUILT)
#
    ssh hgwdev
    cd /hive/data/genomes/mm9/bed
    # This is a re-build, so the work directories may already exist:
    # use mkdir -p / rm -f throughout so the steps can be re-run.
    mkdir -p tRNAs
    cd tRNAs

# Get data files from /projects/lowelab/users/lowe/Browser/vertebrates/

    cp -p /projects/lowelab/users/lowe/Browser/vertebrates/mm9-tRNAs.bed .
    cp -p \
    /projects/lowelab/users/lowe/Browser/vertebrates/mm9_tRNAs_images.tar .

    # Reload the tRNAs table from the new bed file
    hgsql mm9 -e 'drop table if exists tRNAs'
    hgLoadBed -tab mm9 tRNAs mm9-tRNAs.bed -sqlTable=$HOME/kent/src/hg/lib/tRNAs.sql

    # Unpack the images and flatten them out of the images/ subdirectory
    mkdir -p gif
    cd gif
    tar -xvf ../mm9_tRNAs_images.tar
    mv images/*.gif .
    rm -rf images
    # Replace the contents of the gbdb image directory with the new set
    mkdir -p /hive/data/gbdb/mm9/RNA-img
    rm -f /hive/data/gbdb/mm9/RNA-img/*
    cp -p * /hive/data/gbdb/mm9/RNA-img

#####################################################################
# LASTZ/CHAIN/NET Marmoset calJac3 (DONE - 2010-02-12 - Hiram)
    #	use a screen to control this job
    screen
    mkdir /hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12
    cd /hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12

    cat << '_EOF_' > DEF
# mouse vs marmoset
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Marmoset (calJac3)
SEQ2_DIR=/scratch/data/calJac3/calJac3.2bit
SEQ2_LEN=/scratch/data/calJac3/chrom.sizes
SEQ2_LIMIT=75
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 $HOME/kent/src/hg/utils/automation/doBlastzChainNet.pl \
	-verbose=2 `pwd`/DEF \
	-syntenicNet -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    445m42.381s
    cat fb.mm9.chainCalJac3Link.txt 
    #	859869647 bases of 2620346127 (32.815%) in intersection

    #	swap: re-use the mm9 lastz run above (its DEF) to build the
    #	calJac3-referenced chains and nets
    mkdir /hive/data/genomes/calJac3/bed/blastz.mm9.swap
    cd /hive/data/genomes/calJac3/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm9/bed/lastzCalJac3.2010-02-12/DEF \
	-swap -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    90m38.739s
    #	the swap against mm9 writes the chainMm9Link featureBits file
    #	(not chainHg19Link)
    cat fb.calJac3.chainMm9Link.txt
    #	861811978 bases of 2752505800 (31.310%) in intersection

#######################################################################
# felCat4 Cat BLASTZ/CHAIN/NET (DONE  - 2010-06-07 - Chin)
    screen # use a screen to manage this multi-day job
    mkdir /hive/data/genomes/mm9/bed/lastzFelCat4.2010-06-07
    cd /hive/data/genomes/mm9/bed/lastzFelCat4.2010-06-07

    # DEF file for the mm9 (target) vs. felCat4 (query) lastz run; the
    # comments inside now name the sequences actually configured by
    # SEQ1_DIR / SEQ2_DIR.
    cat << '_EOF_' > DEF
# mouse vs. cat
# maximum M allowed with lastz is only 254
BLASTZ_M=254

# TARGET: Mouse mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=20000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Cat (felCat4)
SEQ2_DIR=/scratch/data/felCat4/felCat4.2bit
SEQ2_LEN=/scratch/data/felCat4/chrom.sizes
SEQ2_LIMIT=50
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/mm9/bed/lastzFelCat4.2010-06-07
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -syntenicNet -noDbNameCheck \
        -chainMinScore=3000 -chainLinearGap=medium \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=pk \
        > do.log 2>&1 &
    # real    1272m46.726s

    # The cluster batch in
    # /hive/data/genomes/mm9/bed/lastzFelCat4.2010-06-07/axtChain/run
    # was halted with para stop, para freeBatch. That run directory was
    # then removed and doBlastzChainNet was continued from step chainRun,
    # using memk/swarm this time.

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
         -continue chainRun \
         -syntenicNet -noDbNameCheck \
         -chainMinScore=3000 -chainLinearGap=medium \
         -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
         > do_chainRun.log 2>&1 &
    # real    337m31.606s
    #  *** All done !  Elapsed time: 337m32s
    #  *** Make sure that goldenPath/mm9/vsFelCat4/README.txt is accurate.
    # *** Add {chain,net}FelCat4 tracks to trackDb.ra if necessary.

    cat fb.mm9.chainFelCat4Link.txt
    #   637007193 bases of 2620346127 (24.310%) in intersection

    # swap 
    mkdir /hive/data/genomes/felCat4/bed/blastz.mm9.swap
    cd /hive/data/genomes/felCat4/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/mm9/bed/lastzFelCat4.2010-06-07/DEF \
        -swap -syntenicNet -noDbNameCheck \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    # real    176m42.490s
    # *** All done !  Elapsed time: 176m42s
    # *** Make sure that goldenPath/felCat4/vsMm9/README.txt is accurate.
    # *** Add {chain,net}Mm9 tracks to trackDb.ra if necessary.

    #   real   ???? 125m37.926s 
    cat fb.felCat4.chainMm9Link.txt
    #   616529959 bases of 1990635005 (30.972%) in intersection


#####################################################################
# susScr2 Pig BLASTZ/CHAIN/NET (DONE - 2010-03-26,27 - Hiram)
    screen # use a screen to manage this multi-day job
    mkdir /hive/data/genomes/mm9/bed/lastzSusScr2.2010-03-26
    cd /hive/data/genomes/mm9/bed/lastzSusScr2.2010-03-26

    cat << '_EOF_' > DEF
# Pig vs. Mouse
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Pig SusScr2
SEQ2_DIR=/scratch/data/susScr2/susScr2.2bit
SEQ2_LEN=/scratch/data/susScr2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/mm9/bed/lastzSusScr2.2010-03-26
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #	Elapsed time: 717m25s
    cat fb.mm9.chainSusScr2Link.txt 
    #	616615408 bases of 2620346127 (23.532%) in intersection

    mkdir /hive/data/genomes/susScr2/bed/blastz.mm9.swap
    cd /hive/data/genomes/susScr2/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm9/bed/lastzSusScr2.2010-03-26/DEF \
	-swap -noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	 Elapsed time: 63m4s
    cat fb.susScr2.chainMm9Link.txt 
    #	656444411 bases of 2231298548 (29.420%) in intersection

############################################################################
# Vega aka Havana gene update (DONE - 2010-04-07 - Hiram)
    #	Stephen Trevanion st3 sanger ac uk
    #	lookup version number at the Vega WEB site:
    #	http://vega.sanger.ac.uk/index.html
    #	and FTP site:
    #	ftp://ftp.sanger.ac.uk/pub/vega/
    cd /hive/data/genomes/mm9
    #	step wise to verify operation
    doEnsGeneUpdate.pl -vegaGene -ensVersion=38 -stop=download mm9.ensGene.ra
    doEnsGeneUpdate.pl -vegaGene -ensVersion=38 \
	-continue=process -stop=process mm9.ensGene.ra
# genePredCheck -db=mm9 vegaPseudo.gp.gz
# checked: 4377 failed: 0
# genePredCheck -db=mm9 not.vegaPseudo.gp.gz
# checked: 57096 failed: 0
# genePredCheck -db=mm9 mm9.allGenes.gp.gz
# checked: 61473 failed: 0
    doEnsGeneUpdate.pl -vegaGene -ensVersion=38 \
	-continue=load -stop=load mm9.ensGene.ra
    #	"identical to previous version 37"
    doEnsGeneUpdate.pl -vegaGene -ensVersion=38 \
	-continue=cleanup mm9.ensGene.ra
    featureBits mm9 vegaGene
    # 53838752 bases of 2620346127 (2.055%) in intersection
    featureBits mm9 vegaPseudoGene
    # 3060300 bases of 2620346127 (0.117%) in intersection

#####################################################################
# oviAri1 Sheep BLASTZ/CHAIN/NET (DONE - 2010-04-16 - Chin)
    screen # use a screen to manage this multi-day job
    mkdir /hive/data/genomes/mm9/bed/lastzOviAri1.2010-04-16
    cd /hive/data/genomes/mm9/bed/lastzOviAri1.2010-04-16

    cat << '_EOF_' > DEF
# Sheep vs. Mouse
BLASTZ_M=50

# TARGET: Mouse Mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Sheep OviAri1
SEQ2_DIR=/scratch/data/oviAri1/oviAri1.2bit
SEQ2_LEN=/scratch/data/oviAri1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/mm9/bed/lastzOviAri1.2010-04-16
TMPDIR=/scratch/tmp
'_EOF_'
    # << this line keeps emacs coloring happy

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -noLoadChainSplit -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #   real    269m58.488s
    cat fb.mm9.chainOviAri1Link.txt
    #   406407377 bases of 2620346127 (15.510%) in intersection

    #   and the swap
    mkdir /hive/data/genomes/oviAri1/bed/blastz.mm9.swap
    cd /hive/data/genomes/oviAri1/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/mm9/bed/lastzOviAri1.2010-04-16/DEF \
        -swap -noLoadChainSplit -syntenicNet \
        -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #   real    35m25.217s

    cat fb.oviAri1.chainMm9Link.txt 
    #   383753361 bases of 1201271277 (31.946%) in intersection

#######################################################################
#############################################################################
# ucscRetro track (2010-04-12, baertsch DONE)
mkdir -p /hive/users/baertsch/retro/mm9
cd /hive/users/baertsch/retro/mm9
wget http://compbio.soe.ucsc.edu/retrogene/retroFinder-1.16.tar.gz
tar xvfz retroFinder-1.16.tar.gz
cd retroFinder-1.16/src/pslPseudo
make
cd ../../..


cat << '_EOF_' > DEF
RETRO_OPTIONS="-verbose=4 -minAli=0.98 -nearTop=0.005 "
DB=mm9
SCORETHRESH=550
GENOMENAME='Mus musculus'
GBDB=mm
MRNABASE=/hive/data/genomes/$DB/bed/mrnaBlastz/
TMPMRNA=/hive/users/baertsch/mrnaBlastz/$DB
TMPEST=/hive/users/baertsch/est/$DB
EST=all_est
SPLICED_EST=intronEst
SPLIT_EST=0
SPLIT_SPLICED_EST=1
SCRIPT=/hive/users/baertsch/retro/$DB/retroFinder-1.16/scripts
GENOME=/hive/data/genomes/
RETRODIR=$GENOME/$DB/bed/retro
BASE=/hive/users/baertsch/retro
OUTDIR=/hive/users/baertsch/retro/$DB/
RESULT=$OUTDIR/result
LOG=$OUTDIR/log
OUT=$OUTDIR/out
OVERLAPDIR=$OUTDIR/run.o
VERSION=2
TABLE=ucscRetroInfo$VERSION
ALIGN=ucscRetroAli$VERSION
LOCAL=/scratch/data/$DB
NIB=$LOCAL/nib
RMSK=x
NET1=netHg19 
NET2=netCanFam2 
NET3=netRn4 
GENE1=knownGene
GENE2=refGene
GENE3=ensGene
CLUSTER=swarm
SPECIES="hg18 mm9"
ROOTDIR="~/public_html/retro/mm9Nov09"
EXPDIR=exp
GENEPFAM=knownGene
PFAM=knownToPfam
PFAMIDFIELD=name
PFAMDOMAIN=value
ARRAY=gnfAtlas2
AFFYPROBE=affyGnf1m
ARRAYMEDIAN=hgFixed.gnfMouseAtlas2Median 
ARRAYRATIO=hgFixed.gnfMouseAtlas2AllRatio
ARRAYABS=hgFixed.gnfMouseAtlas2All
ARRAYEXP=hgFixed.gnfMouseAtlas2MedianExps 
ARRAYEXPALL=hgFixed.gnfMouseAtlas2AllExps
ARRAYLOOKUP=knownToGnfAtlas2 
ARRAYPSLS="/hive/data/genomes/mm9/bed/geneAtlas2/affyGnf1m.psl"
ALTSPLICE=sibTxGraph
SPLITBYAGE=splitRetrosByAgeMouse
PDB=proteins090821
'_EOF_'
    # << happy emacs

#add ./retroFinder-1.16/scripts to PATH
retroFinder-1.16/scripts/filterMrna.sh DEF
retroFinder-1.16/scripts/filterEst.sh DEF
nohup retroFinder-1.16/scripts/ucscRetroStep1.sh DEF
#check cluster job
nohup retroFinder-1.16/scripts/ucscRetroStep2.sh DEF
nohup retroFinder-1.16/scripts/ucscRetroStep3.sh DEF
#check cluster job
nohup retroFinder-1.16/scripts/ucscRetroStep4.sh DEF
nohup retroFinder-1.16/scripts/ucscRetroStep5.sh DEF
    # Load the track
nohup retroFinder-1.16/scripts/ucscRetroStep6.sh DEF
#add ucscRetroAli to trackDb.ra 
################################################################
# ADD KEGG TABLES (DONE, Fan, 6/18/10)

# Build keggMapDesc and keggPathway in mm9 from the KEGG FTP files.
mkdir -p /hive/data/genomes/mm9/bed/pathways/kegg
cd /hive/data/genomes/mm9/bed/pathways/kegg

wget --timestamping ftp://ftp.genome.jp/pub/kegg/pathway/map_title.tab

# Prefix the first column of map_title.tab with "mmu", leaving the second
# column as is: insert an "mmu" column, move it to the front, then remove
# the tab that separates it from the original first column.
sed -e 's/\t/\tmmu\t/' map_title.tab > j.tmp
cut -f 2 j.tmp >j.mmu
cut -f 1,3 j.tmp >j.1
paste j.mmu j.1 |sed -e 's/\t//' > keggMapDesc.tab
rm j.mmu j.1
rm j.tmp

# "if exists" so that the first build, when the table is absent, does not fail
hgsql mm9 -e 'drop table if exists keggMapDesc'
hgsql mm9 < ~/kent/src/hg/lib/keggMapDesc.sql
hgsql mm9 -e 'load data local infile "keggMapDesc.tab" into table keggMapDesc'

wget --timestamping ftp://ftp.genome.jp/pub/kegg/genes/organisms/mmu/mmu_pathway.list

# Strip the "path:" prefix and turn the first remaining ":" into a tab
sed -e 's/path://' -e 's/:/\t/' mmu_pathway.list > j.tmp
hgsql mm9 -e 'drop table if exists keggPathway'
hgsql mm9 < ~/kent/src/hg/lib/keggPathway.sql
hgsql mm9 -e 'load data local infile "j.tmp" into table keggPathway'

# Join the freshly loaded rows with knownToLocusLink on locusID to get the
# matching knownToLocusLink name for each row ...
hgsql mm9 -N -e \
'select name, locusID, mapID from keggPathway p, knownToLocusLink l where p.locusID=l.value' \
>keggPathway.tab

# ... then replace the table contents with the joined rows.
hgsql mm9 -e 'delete from keggPathway'

hgsql mm9 -e 'load data local infile "keggPathway.tab" into table keggPathway'

rm j.tmp

################################################################
# Add KEGG column to mm9 Gene Sorter (Done, Fan, 6/18/2010)

mkdir -p /hive/data/genomes/mm9/bed/geneSorter
cd /hive/data/genomes/mm9/bed/geneSorter
# One row per distinct (kgId, mapID, locusID) from keggPathway; the sed
# joins the second mapID and locusID into a single "mapID+locusID" column.
hgsql mm9 -N -e 'select kgId, mapID, mapID, "+", locusID from keggPathway' |sort -u|sed -e 's/\t+\t/+/' > knownToKeggEntrez.tab

# "if exists" so that the first build, when the table is absent, does not fail
hgsql mm9 -e 'drop table if exists knownToKeggEntrez'

hgsql mm9 < ~/kent/src/hg/lib/knownToKeggEntrez.sql

hgsql mm9 -e 'load data local infile "knownToKeggEntrez.tab" into table knownToKeggEntrez'

#############################################################################
#  Update BLASTTAB blast tables 	(DONE - 2010-08-06 - Fan)
    ssh hgwdev
    mkdir -p /hive/data/genomes/mm9/bed/hgNearBlastp/100806
    cd /hive/data/genomes/mm9/bed/hgNearBlastp/100806
    # Get the proteins used by all hgNear organisms:
    pepPredToFa hg19 knownGenePep hg19.known.faa
    pepPredToFa mm9 knownGenePep mm9.known.faa
    pepPredToFa rn4 knownGenePep rn4.known.faa
    pepPredToFa danRer6 ensPep danRer6.ensPep.faa
    pepPredToFa dm3 flyBasePep dm3.flyBasePep.faa
    pepPredToFa ce6 sangerPep ce6.sangerPep.faa
    pepPredToFa sacCer2 sgdPep sacCer2.sgdPep.faa

    cat << '_EOF_' > config.ra
# Latest mouse vs. other Gene Sorter orgs:
# human, rat, zebrafish, worm, yeast, fly

targetGenesetPrefix known
targetDb mm9
queryDbs hg19 rn4 danRer6 dm3 ce6 sacCer2
recipBest         danRer6 dm3 ce6 sacCer2

mm9Fa     /hive/data/genomes/mm9/bed/hgNearBlastp/100806/mm9.known.faa
hg19Fa    /hive/data/genomes/mm9/bed/hgNearBlastp/100806/hg19.known.faa
rn4Fa     /hive/data/genomes/mm9/bed/hgNearBlastp/100806/rn4.known.faa
danRer6Fa /hive/data/genomes/mm9/bed/hgNearBlastp/100806/danRer6.ensPep.faa
dm3Fa     /hive/data/genomes/mm9/bed/hgNearBlastp/100806/dm3.flyBasePep.faa
ce6Fa     /hive/data/genomes/mm9/bed/hgNearBlastp/100806/ce6.sangerPep.faa
sacCer2Fa /hive/data/genomes/mm9/bed/hgNearBlastp/100806/sacCer2.sgdPep.faa

buildDir /hive/data/genomes/mm9/bed/hgNearBlastp/100806
scratchDir /hive/data/genomes/mm9/bed/hgNearBlastp/100806/tmp
'_EOF_'

doHgNearBlastp.pl -targetOnly config.ra >& do.log & tail -f do.log

# *** All done!
# *** Check these tables in mm9:
# *** knownBlastTab hgBlastTab rnBlastTab drBlastTab dmBlastTab ceBlastTab scBlastTab 
#########################################################################
# BUILD CGAP PATHWAY TABLES, DONE, Fan 7/6/2010

    ssh hgwdev
    mkdir -p /hive/data/genomes/mm9/bed/cgap/100706
    cd /hive/data/genomes/mm9/bed/cgap/100706
    
# get data file from data source
    wget --timestamping -O Mm_GeneData.dat "ftp://ftp1.nci.nih.gov/pub/CGAP/Mm_GeneData.dat"

# parse the data file; the cgap*.tab files loaded below are expected in
# this directory afterwards
    hgCGAP Mm_GeneData.dat
    
# "if exists" so that the first build, when the tables are absent, does not fail
    hgsql mm9 -e "drop table if exists cgapBiocPathway"
    hgsql mm9 -e "drop table if exists cgapBiocDesc"
    hgsql mm9 -e "drop table if exists cgapAlias"

    hgsql mm9 < ~/kent/src/hg/hgCGAP/cgapBiocPathway.sql
    hgsql mm9 -e 'LOAD DATA local INFILE "cgapBIOCARTA.tab" into table cgapBiocPathway;'
   
# descriptions are de-duplicated before loading
    hgsql mm9 < ~/kent/src/hg/hgCGAP/cgapBiocDesc.sql
    sort -u cgapBIOCARTAdesc.tab > cgapBIOCARTAdescSorted.tab
    hgsql mm9 -e 'LOAD DATA local INFILE "cgapBIOCARTAdescSorted.tab" into table cgapBiocDesc;'
    
# cgapAlias is the de-duplicated union of the sequence, symbol and alias files
    hgsql mm9 < ~/kent/src/hg/hgCGAP/cgapAlias.sql
    sort -u cgapSEQUENCE.tab cgapSYMBOL.tab cgapALIAS.tab > cgapAlias.tab
    hgsql mm9 -e 'LOAD DATA local INFILE "cgapAlias.tab" into table cgapAlias'

#########################################################################
# phyloP conservation for 30-way (DONE - 2010-07-15 - Hiram)
#
# Vertebrate, Placental, Euarchontoglires
#
    # split SS files into 1M chunks, this business needs smaller files
    #   to complete

    ssh swarm
    mkdir /hive/data/genomes/mm9/bed/multiz30way/consPhyloP
    cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP
    mkdir ss run.split
    cd run.split

    cat << '_EOF_' > doSplit.csh
#!/bin/csh -ef
# Split one MAF file into SS windows with msa_split.
#   $1 - base name of the MAF in multiz30way/maf.split (no .maf suffix)
#   $2 - marker file: written when the job has finished; $2.running
#        exists while the job is in progress
set c = $1
# Already finished or already claimed: nothing to do.  Checked before
# anything else so the MAF is not read twice for no reason.
if ( -s $2 ) then
    exit 0
endif
if ( -s $2.running ) then
    exit 0
endif
set MAF = /hive/data/genomes/mm9/bed/multiz30way/maf.split/$c.maf
set WINDOWS = /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/run.split/ss/$c
# WC = total lines, NL = lines starting with "#"
set WC = `cat $MAF | wc -l`
set NL = `grep "^#" $MAF | wc -l`

date >> $2.running

rm -fr $WINDOWS
mkdir $WINDOWS
pushd $WINDOWS > /dev/null
# run msa_split only when the MAF has lines other than "#" lines
if ( $WC != $NL ) then
/cluster/bin/phast.build/cornellCVS/phast.2009-10-19/bin/msa_split \
    $MAF -i MAF -o SS -r $WINDOWS/$c -w 1000000,0 -I 1000 -B 5000
endif
popd > /dev/null
date >> $2
rm -f $2.running
'_EOF_'
# << happy emacs
    chmod +x doSplit.csh

    # One entry per file in maf.split, smallest first (ls -S -r).  Strip only
    # a literal trailing ".maf": the dot is escaped and the match is anchored,
    # so "maf" preceded by any other character elsewhere in a name is left
    # alone.
    ls -1S -r ../../maf.split | sed -e 's/\.maf$//' > maf.list

    cat << '_EOF_' > template
#LOOP
doSplit.csh $(path1) {check out exists+ done/$(path1).done}
#ENDLOOP
'_EOF_'
# << happy emacs

    mkdir ss done
    ssh memk
    cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/run.split
    gensub2 maf.list single template jobList
    para -ram=8g create jobList
# Completed: 75 of 75 jobs
# CPU time in finished jobs:       9843s     164.05m     2.73h    0.11d  0.000 y
# IO & Wait Time:                  2938s      48.97m     0.82h    0.03d  0.000 y
# Average job time:                 170s       2.84m     0.05h    0.00d
# Longest finished job:             393s       6.55m     0.11h    0.00d
# Submission to last job:           678s      11.30m     0.19h    0.01d

    # run phyloP with --method LRT 
    ssh swarm
    mkdir /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/run.phyloP
    cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/run.phyloP

    # Adjust model file base composition background and rate matrix to be
    # representative of the chromosomes in play
    grep BACKGROUND ../../cons/all/all.mod | awk '{printf "%0.3f\n", $3 + $4}'
    #	0.410
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \
	../../cons/all/all.mod 0.410 > all.mod
    grep BACKGROUND ../../cons/euarchontoglires/euarchontoglires.mod \
	| awk '{printf "%0.3f\n", $3 + $4}'
    #	0.410
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \
	../../cons/euarchontoglires/euarchontoglires.mod 0.410 \
	> euarchontoglires.mod
    grep BACKGROUND ../../cons/placental/placental.mod \
	| awk '{printf "%0.3f\n", $3 + $4}'
    #	0.410
    /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin/modFreqs \
	../../cons/placental/placental.mod 0.410 > placental.mod

    # doPhyloP.csh: job script that runs phyloP on one SS chunk.
    #   $1 - chunk name; the input read is
    #        consPhyloP/run.split/ss/<$1 minus its last extension>/$1.ss
    #   $2 - path of the .wigFix file to produce
    # The model file is chosen from the name of the directory the job is
    # run in ($cwd:t): <dir>.mod is symlinked from run.phyloP into a
    # per-job tmp directory under consPhyloP/tmp.  phyloP is run there with
    # --method LRT --mode CONACC --wig-scores, its output is moved to $2 and
    # the tmp directory is removed.
    # NOTE(review): $n is set but never used in the script; the reason for
    # the "sleep 4" before the mv is not recorded here - confirm before
    # changing either.
    cat << '_EOF_' > doPhyloP.csh
#!/bin/csh -fe
set PHASTBIN = /cluster/bin/phast.build/cornellCVS/phast.2009-10-21/bin
set f = $1
set out = $2
set cName = $f:r:r
set chrDir = $f:r
set n = $f:r:e
set grp = $cwd:t
set cons = /hive/data/genomes/mm9/bed/multiz30way/consPhyloP
set tmp = $cons/tmp/$grp/$f
rm -fr $tmp
mkdir -p $tmp
set ssSrc = "$cons/run.split/ss/$chrDir/$f"
set useGrp = "$grp.mod"
ln -s $cons/run.phyloP/$grp.mod $tmp
pushd $tmp > /dev/null
$PHASTBIN/phyloP --method LRT --mode CONACC --wig-scores --chrom $cName \
    -i SS $useGrp $ssSrc.ss > $f.wigFix
popd > /dev/null
mkdir -p $out:h
sleep 4
mv $tmp/$f.wigFix $out
rm -fr $tmp
'_EOF_'
    # << happy emacs
    chmod +x doPhyloP.csh

    # Create list of chunks
    find ../run.split/ss -type f | sed -e "s/.ss$//; s#^../run.split/ss/##" \
	> ss.list

    # Create template file
    #	file1 == $chr/$chunk/file name without .ss suffix
    cat << '_EOF_' > template
#LOOP
../run.phyloP/doPhyloP.csh $(file1) {check out line+ wigFix/$(dir1)/$(file1).wigFix}
#ENDLOOP
'_EOF_'
    # << happy emacs

    ######################   Running all species  #######################
    # setup run for all species
    mkdir /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/all
    cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/all
    rm -fr wigFix
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
    para create jobList
    para try ... check ... push ... etc ...
    para time
# Completed: 2685 of 2685 jobs
# CPU time in finished jobs:     641900s   10698.34m   178.31h    7.43d  0.020 y
# IO & Wait Time:                 19012s     316.86m     5.28h    0.22d  0.001 y
# Average job time:                 246s       4.10m     0.07h    0.00d
# Longest finished job:             484s       8.07m     0.13h    0.01d
# Submission to last job:         31192s     519.87m     8.66h    0.36d

    ssh hgwdev
    #	work in the mm9 "all species" run directory, where the cluster
    #	jobs above wrote their wigFix/ results
    cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/all
    #	the sed | sort | sed sequence splits the file names on "." and "-"
    #	so that sort can order them by name and then numerically by the
    #	second field; the last sed puts the original names back together
    find ./wigFix -type f \
	| sed -e "s#^./##; s/\./ /g; s/-/ - /g" \
	| sort -k1,1 -k2,2n | sed -e "s/ - /-/g; s/ /./g" > wigFile.list
    #	concatenate the chunks in that order, strip the "__N" suffix and
    #	encode; runs in the background, check wigEncode.log when it is done
    cat wigFile.list | xargs cat | sed -e "s/__[0-9]//" \
	| wigEncode stdin phyloP30way.wig \
		phyloP30way.wib  > wigEncode.log 2>&1 &
    #	Converted stdin, upper limit 5.04, lower limit -10.12

    #	good test to make sure no overlapping coordinates, bigWig:
    #	consumes massive amount of memory, in bash raise your memory limits:
    ulimit -d 188743680
    ulimit -v 188743680
    time cat wigFile.list | xargs cat | sed -e "s/__[0-9]//" \
	| wigToBigWig stdin ../../../../chrom.sizes phyloP30way.bw &

    #	if you wanted to use the bigWig file, loading bigWig table:
    ln -s `pwd`/phyloP30way.bw /gbdb/mm9/bbi
    hgsql mm9 -e 'drop table if exists phyloP30wayAll; \
            create table phyloP30wayAll \
		(fileName varchar(255) not null); \
            insert into phyloP30wayAll values
	("/gbdb/mm9/bbi/phyloP30way.bw");'

    #	loading the wiggle table:
    ln -s `pwd`/phyloP30way.wib /gbdb/mm9/multiz30way
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \
	phyloP30wayAll phyloP30way.wig
    #	real    0m32.778s

    #	create download files:
    # Write a helper that builds the per-sequence download files.
    # For every path in wigFile.list it takes the name of the file's
    # parent directory with its extension removed ($F:h:t:r, presumably
    # the chromosome name), strips the "__N" suffix from the data and
    # appends it to downloads/<name>.wigFix.  Because it appends (>>),
    # downloads/ must not hold results from an earlier run.
    cat << '_EOF_' > mkDown.csh
#!/bin/csh -fe
foreach F (`cat wigFile.list`)
    set C = $F:h:t:r
    cat $F | sed -e "s/__[0-9]//" >> downloads/${C}.wigFix
end
'_EOF_'
    # << happy emacs
    chmod +x ./mkDown.csh
    mkdir downloads
    time ./mkDown.csh
    #	real    16m19.683s

    time gzip downloads/chr*.wigFix
    #	real    47m11.017s

    wigTableStats.sh mm9 phyloP30wayAll
# db.table      min max mean count sumData
# mm9.phyloP30wayAll      -10.116 5.038 0.119587 1914580285 2.28959e+08
#	stdDev viewLimits
#	0.760605 viewLimits=-3.68344:3.92261
    #	that range is: 10.116+5.038 = 15.154

    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.015154 -hBinCount=1000 -hMinVal=-10.116 -verbose=2 \
	    -db=mm9 phyloP30wayAll > histogram.data 2>&1
    #	real    8m15.623s

    #	create plot of histogram:

    # Plot histogram.data into histo.png: column 5 against column 2 as
    # impulses (relative frequency, left axis) and column 7 against
    # column 2 as a line (cumulative relative frequency, right axis).
    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm9 Histogram phyloP30way track, all 30 vertebrates"
set xlabel " phyloP30way score, all 30 vertebrates"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.04]
set xrange [-2:2]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

    ######################   Running the euarchontoglires  #######################
    mkdir /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/euarchontoglires
    cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/euarchontoglires
    rm -fr wigFix
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
    para create jobList
    para try ... check ... push ... etc ...
    para time
# Completed: 2685 of 2685 jobs
# CPU time in finished jobs:     127142s    2119.04m    35.32h    1.47d  0.004 y
# IO & Wait Time:                 53995s     899.91m    15.00h    0.62d  0.002 y
# Average job time:                  67s       1.12m     0.02h    0.00d
# Longest finished job:             125s       2.08m     0.03h    0.00d
# Submission to last job:           277s       4.62m     0.08h    0.00d


    cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/euarchontoglires
    #	the sed | sort | sed
    #	trick gets the files sorted so that coordinates and chromosomes
    #	are in chrom and chromStart order and thus wigEncode sees a proper
    #	incoming data stream sorted by coordinates.
    find ./wigFix -type f \
	| sed -e "s#^./##; s/\./ /g; s/-/ - /g" \
	| sort -k1,1 -k2,2n | sed -e "s/ - /-/g; s/ /./g" > wigFile.list
    cat wigFile.list | xargs cat | sed -e "s/__[0-9]//" \
	| wigEncode stdin phyloP30wayEuarchontoglires.wig \
		phyloP30wayEuarchontoglires.wib  > wigEncode.log 2>&1 &
    cat wigEncode.log
    #	Converted stdin, upper limit 1.13, lower limit -7.49

    #	good test to make sure no overlapping coordinates, bigWig:
    #	consumes massive amount of memory, in bash raise your memory limits:
    ulimit -d 188743680
    ulimit -v 188743680
    cat wigFile.list | xargs cat | sed -e "s/__[0-9]//" \
	| wigToBigWig stdin ../../../../chrom.sizes \
	    phyloP30wayEuarchontoglires.bw
# XXX running Fri Jul  9 14:33:29 PDT 2010
    #	if you wanted to use the bigWig file, loading bigWig table:
    ln -s `pwd`/phyloP30wayEuarchontoglires.bw /gbdb/mm9/bbi
    hgsql mm9 -e 'drop table if exists phyloP30wayEuarchontoglires; \
            create table phyloP30wayEuarchontoglires \
		(fileName varchar(255) not null); \
            insert into phyloP30wayEuarchontoglires values
	("/gbdb/mm9/bbi/phyloP30wayEuarchontoglires.bw");'

    #	loading the wiggle table:
    ln -s `pwd`/phyloP30wayEuarchontoglires.wib /gbdb/mm9/multiz30way
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \
	phyloP30wayEuarch phyloP30wayEuarchontoglires.wig
    #	real    0m39.041s
    #	clean garbage:
    rm -f wiggle.tab

    #	create download files:
    mkdir downloads
    time ../all/mkDown.csh
    #	real    18m44.186s
    time gzip downloads/chr*.wigFix
    #	real    32m11.301s

    wigTableStats.sh mm9 phyloP30wayEuarchontoglires
# db.table      min max mean count
# mm9.phyloP30wayEuarchontoglires -7.486 1.126 0.0662017 1914580285
#	1.26749e+08 0.594433 viewLimits=-2.90596:1.126
#	that range is: 7.486+1.126 = 8.612

    #  Create histogram to get an overview of all the data, using the
    #	numbers from wigTableStats above:
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.008612 -hBinCount=1000 -hMinVal=-7.486 -verbose=2 \
	    -db=mm9 phyloP30wayEuarchontoglires > histogram.data 2>&1
    #	real    8m15.623s

    #	create plot of histogram:

    # Plot histogram.data into histo.png: column 5 against column 2 as
    # impulses (relative frequency, left axis) and column 7 against
    # column 2 as a line (cumulative relative frequency, right axis).
    cat << '_EOF_' | gnuplot > histo.png
set terminal png small size 1000,600 x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm9 Histogram phyloP30wayEuarchontoglires track"
set xlabel " phyloP30wayEuarchontoglires score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0.0:0.07]
set xrange [-2.0:1.13]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

######################   Running the placental  #######################
    mkdir /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/placental
    cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/placental
    rm -fr wigFix
    mkdir wigFix

    gensub2 ../run.phyloP/ss.list single ../run.phyloP/template jobList
    para create jobList
    para try ... check ... push ... etc ...
    para time
# Completed: 2685 of 2685 jobs
# CPU time in finished jobs:     237516s    3958.60m    65.98h    2.75d  0.008 y
# IO & Wait Time:                 45828s     763.80m    12.73h    0.53d  0.001 y
# Average job time:                 106s       1.76m     0.03h    0.00d
# Longest finished job:             196s       3.27m     0.05h    0.00d
# Submission to last job:           426s       7.10m     0.12h    0.00d

    cd /hive/data/genomes/mm9/bed/multiz30way/consPhyloP/placental
    find ./wigFix -type f \
	| sed -e "s#^./##; s/\./ /g; s/-/ - /g" \
	| sort -k1,1 -k2,2n | sed -e "s/ - /-/g; s/ /./g" > wigFile.list
    cat wigFile.list | xargs cat | sed -e "s/__[0-9]//" \
	| wigEncode stdin phyloP30wayPlacental.wig phyloP30wayPlacental.wib \
	    > wigEncode.log 2>&1 &
    #	Converted stdin, upper limit 2.06, lower limit -9.46


    #	good test to make sure no overlapping coordinates, bigWig:
    #	consumes massive amount of memory, in bash raise your memory limits:
    ulimit -d 188743680
    ulimit -v 188743680
    cat wigFile.list | xargs cat | sed -e "s/__[0-9]//" \
	| wigToBigWig stdin ../../../../chrom.sizes phyloP30wayPlacental.bw \
	    > bigEncode.log 2>&1 &

    #	loading bigWig table:
    ln -s `pwd`/phyloP30wayPlacental.bw /gbdb/mm9/bbi
    hgsql mm9 -e 'drop table if exists phyloP30wayPlacental; \
            create table phyloP30wayPlacental \
		(fileName varchar(255) not null); \
            insert into phyloP30wayPlacental values
	("/gbdb/mm9/bbi/phyloP30wayPlacental.bw");'

    #	loading the wiggle table: the .wib is reached through a /gbdb
    #	symlink, and -pathPrefix records that location in the table
    ln -s `pwd`/phyloP30wayPlacental.wib /gbdb/mm9/multiz30way
    time nice -n +19 hgLoadWiggle -pathPrefix=/gbdb/mm9/multiz30way mm9 \
	phyloP30wayPlacental phyloP30wayPlacental.wig
    #	real    0m35.726s

    #	create download files:
    mkdir downloads
    time ../all/mkDown.csh
    #	real    18m52.778s
    time gzip downloads/chr*.wigFix
    #	real    30m55.550s

    wigTableStats.sh mm9 phyloP30wayPlacental
# db.table      min max mean count sumData stdDev viewLimits
# mm9.phyloP30wayPlacental        -9.46 2.058 0.07797 1914580285 1.4928e+08
#	stdDev viewLimits
#	 0.668819 viewLimits=-3.26613:2.058
    #	that range is: 9.46+2.058 = 11.518

    #  Create histogram to get an overview of all the data
    time nice -n +19 hgWiggle -doHistogram \
	-hBinSize=0.011518 -hBinCount=1000 -hMinVal=-9.46 -verbose=2 \
	    -db=mm9 phyloP30wayPlacental > histogram.data 2>&1
    #	real    8m15.623s

    #	create plot of histogram:

    # Plot histogram.data into histo.png: column 5 against column 2 as
    # impulses (relative frequency, left axis) and column 7 against
    # column 2 as a line (cumulative relative frequency, right axis).
    cat << '_EOF_' | gnuplot > histo.png
set terminal png small color x000000 xffffff xc000ff x66ff66 xffff00 x00ffff
set size 1.4, 0.8
set key left box
set grid noxtics
set grid ytics
set title " Mouse Mm9 Histogram phyloP30wayPlacental track"
set xlabel " phyloP30wayPlacental score"
set ylabel " Relative Frequency"
set y2label " Cumulative Relative Frequency (CRF)"
set y2range [0:1]
set y2tics
set yrange [0:0.03]
set xrange [-2.5:2.5]

plot "histogram.data" using 2:5 title " RelFreq" with impulses, \
        "histogram.data" using 2:7 axes x1y2 title " CRF" with lines
'_EOF_'
    #	<< happy emacs

    display histo.png &

#############################################################################
# Agilent arrays (2010-12-01 Andy)
cd /hive/data/genomes/mm9/bed/agilentProbes/
# FTP download from ftp.agilent.com using given user/pass from Anniek De-witte
# (anniek_de-witte@agilent.com)
# downloaded files are gzipped beds. The files are typically located in a 
# directory called "FOR_UCSC" or something like that.  The user/pass and the
# directory are deleted after it's confirmed they're received, so it's not
# too helpful to mention specifics here.
ftp -u user -p password ftp.agilent.com
> cd directory
> get 027411_D_BED_20100308.bed.gz
> get 027414_D_BED_20100318.bed
# unzip everything
gunzip 027*.bed.gz
ln -s 027414_D_BED_20100318.bed agilentCgh1x1m.ct.bed
ln -s 027411_D_BED_20100308.bed agilentCgh4x180k.ct.bed
for bed in agilent*.bed; do
    tail -n +2 $bed | hgLoadBed mm9 ${bed%.ct.bed} stdin
done
rm bed.tab


##########################################################################
# Build targetScanS track - (DONE - 2010-12-14 galt)
#       requested by: George Bell gbell at wi.mit.edu
    ssh hgwdev
    mkdir -p /cluster/data/mm9/bed/targetScanS
    cd /cluster/data/mm9/bed/targetScanS

    wget --timestamping http://www.targetscan.org/mmu_50/ucsc/mm9/mm9Cons_ALL_CHRS.BED

    hgLoadBed mm9 targetScanS mm9Cons_ALL_CHRS.BED
    #	Loaded 38961 elements of size 6
    featureBits mm9 targetScanS
    #   253088 bases of 2620346127 (0.010%) in intersection

    # Create/edit/check in targetScans.html and trackDb.ra under
    # kent/src/hg/makeDb/trackDb/mouse/mm9

#########################################################################
# LASTZ Zebrafish DanRer7 (DONE - 2010-12-17 - hiram)
    mkdir /hive/data/genomes/mm9/bed/lastzDanRer7.2010-12-17
    cd /hive/data/genomes/mm9/bed/lastzDanRer7.2010-12-17

    cat << '_EOF_' > DEF
# mouse vs zebrafish
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Mouse mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=5

# QUERY: Zebrafish danRer7
SEQ2_DIR=/scratch/data/danRer7/danRer7.2bit
SEQ2_LEN=/scratch/data/danRer7/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=40

BASE=/hive/data/genomes/mm9/bed/lastzDanRer7.2010-12-17
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	Elapsed time: 254m42s

    #	coverage of mm9 by the danRer7 chains, from this (mm9) run directory
    cat fb.mm9.chainDanRer7Link.txt
    #	68190354 bases of 2620346127 (2.602%) in intersection

    #	and the swap to danRer7
    mkdir /hive/data/genomes/danRer7/bed/blastz.mm9.swap
    cd /hive/data/genomes/danRer7/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm9/bed/lastzDanRer7.2010-12-17/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    16m8.672s

    cat fb.danRer7.chainMm9Link.txt 
    #	71960602 bases of 1409770109 (5.104%) in intersection

#########################################################################
# YALE PSEUDOPIPE PSEUDOGENE PREDICTIONS BASED ON ENSEMBL 60
# (hartera, 2010-12-23 - 2010-12-24, DONE)
# FTP site e-mailed on 2010-12-22 by Suganthi Balasubramanian 
# (suganthi.bala@yale.edu) from the Gerstein lab. Data is from their 
# PseudoPipe pipeline and it is based on proteins from Ensembl Build 60 
# (pseudogene data from December 2010?). 
# NOTE: this data will be replaced before being released to the RR as there 
# is a problem with the IDs. 

mkdir -p /hive/data/genomes/mm9/bed/pseudoYale60
cd /hive/data/genomes/mm9/bed/pseudoYale60

# Go to http://tables.pseudogene.org/set.py?id=Mouse60 and click on the
# download link to download Mouse60.txt and copy the file to this directory.
# Header from data file.
ID      Chromosome      Start Coordinate        Stop Coordinate Strand  Parent
Protein  Protein Start   Protein Stop    Parent Gene     Fraction        Num
Insertions  Num Deletions   Num Shifts      Num Stops       E Value Identity
PolyA   Disablements    Exons   Introns Class   Sequence        Link

# urls are of type:
# http://tables.pseudogene.org/mouse60/<ID> so this can be added to the
# trackDb as for the previous track. Just update the Ensembl 59 trackDb entry.

# Get list of chroms:
awk '{print $2}' Mouse60.txt | sort | uniq
# chromosomes are 1-19, X, Y

# Convert data to genePred:

cat << '_EOF_' > formatPseudogenesToGenePred
#!/usr/bin/awk -f
# Parse Yale pseudogene data file (tab-separated) and write genePred lines
# to stdout.
# Columns used: $1 ID, $2 chromosome, $3 start (1-based), $4 stop,
# $5 strand, $19 exon coordinate list.
# Exon coordinates are in this format:
# [[28688544, 28688864], [28689678, 28691174], [28694308, 28694460], [28701327, 28701749]]
BEGIN {
  FS = "\t";
  OFS = "\t";
}
# Ignore header line
/^ID/ {
  next;
}
# Parse the data lines
{
  # Drop the brackets so only a comma-separated list of numbers is left.
  gsub(/\[/, "", $19);
  gsub(/\]/, "", $19);
  # split() returns the number of coordinates (starts plus ends); using its
  # return value avoids length(array), which not every awk supports.
  nCoords = split($19, exons, ",");
  count = nCoords / 2;
  # Write out genePred. Add chr in front of chrom only if not haplotype.
  # txStart is converted from 1-based to 0-based; cdsStart and cdsEnd are 0.
  if ($2 !~ /HSCHR/) {
     printf "%s\tchr%s\t%c\t%d\t%d\t0\t0\t%d\t", $1, $2, $5, $3-1, $4, count;
  }
  else {
     printf "%s\t%s\t%c\t%d\t%d\t0\t0\t%d\t", $1, $2, $5, $3-1, $4, count;
  }
  # get list of exon starts (odd entries), convert from 1-based to 0-based
  for (i = 1; i <= nCoords; i += 2) {
     printf "%d,", exons[i] - 1;
  }
  printf "\t";
  # get list of exon ends (even entries)
  for (i = 2; i <= nCoords; i += 2) {
     printf "%d,", exons[i];
  }
  printf "\n";
}
'_EOF_'

chmod +x formatPseudogenesToGenePred
# format the Yale pseudogenes data to genePred.
./formatPseudogenesToGenePred Mouse60.txt > pseudoYaleMouse60.gp 

# Load the genePred file into mm9
hgLoadGenePred mm9 pseudoYale60 pseudoYaleMouse60.gp
# Didn't load. There are 4 invalid genePreds:
Error: invalid genePred: PGOMOU00000130313 exon 1 overlaps previous exon
Error: invalid genePred: PGOMOU00000139101 exon 1 overlaps previous exon
Error: invalid genePred: PGOMOU00000136201 exon 1 overlaps previous exon
Error: invalid genePred: PGOMOU00000128816 exon 1 overlaps previous exon
Error: 4 invalid genePreds, database unchanged

# File didn't load into database.
# Make a file of these ids - invalidIds
grep -f invalidIds -vw pseudoYaleMouse60.gp > pseudoYaleMouse60NoInvalidGps.gp
wc -l pseudoYale*gp
# 19086 pseudoYaleMouse60.gp
# 19082 pseudoYaleMouse60NoInvalidGps.gp

# Then re-load database
hgLoadGenePred mm9 pseudoYale60 pseudoYaleMouse60NoInvalidGps.gp

# 2010-12-24
# Add trackDb.ra entry for track, add a search and make sure
# there is a description page, copy over from the gencodeYalePseudoBuild59
# html. 

# start the mouse description page from the human hg19 build 59 page
cp \
/hive/users/hartera/GencodeWG/ccds/trunk/gencode/browser/trackDb/human/hg19/gencodeYalePseudoBuild59.html \
~/kent/src/hg/makeDb/trackDb/mouse/mm9/pseudoYale60.html
# Edit this for mouse and add the list of 4 IDs of genes that were removed due
# to overlapping exon coordinates. Commit to git.
# Check pseudogene types in data:
tail -n +2 Mouse60.txt | tawk '{print $21}' | sort | uniq
#Ambiguous
#Duplicated
#Processed
# Build class table for colouring pseudogenes by type. 
# copy over class table definition from a previous set of Yale pseudogenes.
cp -p /hive/groups/gencode/browser/hg19/gencodeYalePseudoBuild59/gencodeYalePseudoBuild59Class.sql \
 pseudoYale60Class.sql

# Make the class table file:
tail -n +2 Mouse60.txt \
  | tawk '{print $1, $21, "Yale"}' | sort > pseudoYale60Class.txt

# load table
hgLoadSqlTab mm9 pseudoYale60Class \
    pseudoYale60Class.sql pseudoYale60Class.txt

hgsql -e 'select distinct(class) from pseudoYale60Class;' mm9
+------------+
| class      |
+------------+
| Ambiguous  | 
| Processed  | 
| Duplicated | 
+------------+

# Add these classes to the trackDb.ra entry for the geneClasses field and 
# to the list of classes with colours.
# Added the classes to:
# ~/kentJuly2010/kent/src/hg/makeDb/trackDb/tagTypes.tab
# e.g. gClass_Processed genePred

#########################################################################
# Refresh mm9.knownToVisiGene
#
# After we fixed another issue on hgwdev and rebuilt knownToVisiGene
# it picked up the new mm9 ucsc genes.  Because that has not
# yet been released to RR, I remade it on hgwbeta.
ssh hgwbeta
knownToVisiGene mm9

#########################################################################
# SEGMENTAL DUPLICATIONS (REBUILT 9/26/2011 Fan)
    # corrected data file sent by email from John Huddleston [jlhudd@uw.edu].
    mkdir /cluster/data/mm9/bed/genomicSuperDups/09262011
    cd /cluster/data/mm9/bed/genomicSuperDups/09262011
    wget --timestamping ftp://mesh.gs.washington.edu/pub/UCSC/mm9genomicSuperDups.fixed.tab.gz
    gzip -d mm9genomicSuperDups.fixed.tab.gz
    awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' mm9genomicSuperDups.fixed.tab \
    | hgLoadBed mm9 genomicSuperDups stdin \
      -tab -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql
#####################################################################
# NCBI Incident database (DONE - 2011-02-10 - Hiram)
    # this procedure is run as a cron job in Hiram's account:

    #	43 09 * * * /hive/data/outside/ncbi/incidentDb/runUpdate.sh makeItSo

    # using the two scripts there: runUpdate.sh and update.sh
    # which are checked into the source tree as files:
    #	src/hg/utils/automation/ncbiIncidentUpdate.sh
    #	src/hg/utils/automation/ncbiRunIncidentUpdate.sh

    # they fetch the XML files from NCBI, convert them to SQL text
    # files, construct a bigBed file, and push it to genomewiki if
    # it is an update from the previous one

    # the table in the dataBase is: ncbiIncidentDb
    # which is the URL to the bb file, a single row:
    # http://genomewiki.ucsc.edu/images/8/85/Hg19.ncbiIncidentDb.bb

#########################################################################
# KOMP/IKMC (KNOCKOUT MOUSE PROJECT became Int'l Knockout Mouse Cons) (DONE 8/2/11 Fan)
# done 8/2/11  w/files emailed from Carol
# done 3/22/11 w/files emailed from Carol
# renamed to ikmc 3/25/10 at Carol's request
# done 3/12/10 w/files emailed from Carol 3/12
# done 12/8/09 w/files emailed from Carol 12/7
# done 7/24/09 w/files emailed from Carol 7/24
# done 5/7/09 w/files emailed from Carol Bult 5/7
# done 2/12/09 w/files emailed from Carol Bult 2/12
# done 10/21/08 w/files emailed from Carol Bult 10/18
    ssh hgwdev
    mkdir -p /hive/data/genomes/mm9/bed/ikmc/2011_08
    cd /hive/data/genomes/mm9/bed/ikmc/2011_08
    # Save files emailed from Carol Bult as 
    # 20110801_ikmc.gff.gz
    # Make bed12 with itemRgb:
    zcat 20110801_ikmc.gff.gz \
    | perl -we \
      'while (<>) { \
         s/\r?\n$//; \
         ($chr, undef, $ctr, $s, $e, undef, undef, undef, $id, $col, $n) = split("\t"); \
         if ($s eq "") { warn "$_\n";  s/^.*//; next; } # Some lines have no coords. \
         $col = ($col eq "Yellow") ? "255,215,0" : \
                ($col eq "Green")  ? "0,240,0" : \
                ($col eq "Blue")   ? "0,0,200" : "0,0,0"; \
         $s--; \
         $id =~ s/^MGI:\d+; (\w+); .*/$1/ || die "Cant parse id \"$id\""; \
         my $geneId = join("|", $chr, $ctr, "${n}_$id"); \
         push @{$geneBlks{$geneId}}, [$s, $e, $col] unless $e <= 0; \
      } \
      warn "Got " . scalar(keys %geneBlks) . " genes.\n"; \
      foreach my $geneId (keys %geneBlks) { \
         my @blks = @{$geneBlks{$geneId}}; \
         my ($chrom, $center, $name) = split(/\|/, $geneId); \
         my $blkCount = @blks; \
         @blks = sort {$a->[0] <=> $b->[0]} @blks; \
         my $chromStart = $blks[0]->[0]; \
         my $chromEnd = $blks[$blkCount-1]->[1]; \
         my $color = $blks[0]->[2]; \
         my $blkStarts = ""; \
         my $blkSizes = ""; \
         foreach my $blk (@blks) { \
           my ($start, $end, $col) = @{$blk}; \
           $blkStarts .= ($start - $chromStart) . ","; \
           $blkSizes  .= ($end - $start) . ","; \
           if ($col ne $color) { die "Blocks of $geneId of colors $color and $col"; } \
         } \
        print join("\t", $chrom, $chromStart, $chromEnd, $name, 0, ".", $chromStart, \
                   $chromStart, $color, $blkCount, $blkSizes, $blkStarts) . "\n"; \
      }' \
    | sort -k 1,1 -k 2n,2n > ikmc.bed
#Got 49000 genes.
    # No stderr empty-coord warnings this time (no unmapped items).
    # Make an alias-style table with associated info (MGI ID and status):
    zcat 20110801_ikmc.gff.gz \
    | perl -wpe 's/\r?\n$//; @w = split("\t"); \
      if ($w[3] eq "") { s/^.*//; next; } # Some lines have no coords. \
      if ($w[4] <= 0) { s/^.*//; next; } # A few lines have end=0. \
      $w[8] =~ m/^(MGI:\d+); (\w+); (\w.*)/ || die; \
      ($mgi, $designId, $status) = ($1, $2, $3); \
      $_ = "$w[10]_$designId\t$mgi,$w[2],$status\n";' \
    | sort -u > ikmcExtra.tab
    wc -l ikmcExtra.tab
#49000 ikmcExtra.tab
    # Load 'em up:
    hgLoadBed mm9 ikmc ikmc.bed
#Loaded 49000 elements of size 12
    hgLoadSqlTab mm9 ikmcExtra $HOME/kent/src/hg/lib/genericAlias.sql ikmcExtra.tab
    checkTableCoords -verbose=2 mm9 ikmc
#mm9.ikmc item Tekt3_41479 chr11:62887195-62896116: blocks 3 and 4 overlap.
#mm9.ikmc item Tekt3_41478 chr11:62887195-62896116: blocks 3 and 4 overlap.
#mm9.ikmc item Tekt3_41477 chr11:62887195-62896116: blocks 3 and 4 overlap.
#mm9.ikmc item Tekt3_41476 chr11:62887195-62896116: blocks 3 and 4 overlap.
#mm9.ikmc item Cbx1_93671 chr11:96659010-96669485: blocks 1 and 2 overlap.
#mm9.ikmc has 5 records with overlapping blocks.
#mm9.ikmc item Cbx3_93731 chr6:51423841-51433715: blocks 1 and 2 overlap.
#mm9.ikmc has 1 records with overlapping blocks.

# Remove items that checkTableCoords reported above as having overlapping
# blocks, first from the ikmc track table and then from ikmcExtra.
# NOTE(review): Cbx1_93671 was also reported by checkTableCoords but is not
# deleted here - confirm whether that omission is intentional.
hgsql mm9 -e 'delete from ikmc where name="Tekt3_41479"'
hgsql mm9 -e 'delete from ikmc where name="Tekt3_41478"'
hgsql mm9 -e 'delete from ikmc where name="Tekt3_41477"'
hgsql mm9 -e 'delete from ikmc where name="Tekt3_41476"'
hgsql mm9 -e 'delete from ikmc where name="Cbx3_93731"'

hgsql mm9 -e 'delete from ikmcExtra where name="Tekt3_41479"'
hgsql mm9 -e 'delete from ikmcExtra where name="Tekt3_41478"'
hgsql mm9 -e 'delete from ikmcExtra where name="Tekt3_41477"'
hgsql mm9 -e 'delete from ikmcExtra where name="Tekt3_41476"'
hgsql mm9 -e 'delete from ikmcExtra where name="Cbx3_93731"'

# Carol talked to the Sanger folks about those... pls waive.

    # Note from July '09: Carol noticed some very long items and is asking
    # Sanger about them.  Here's how to check it ourselves next time:
    hgsql mm9 -e 'select name, (chromEnd-chromStart) as length from ikmc \
                  where chromEnd - chromStart > 1000000 order by length desc;'
#+------------------+-----------+
#| name             | length    |
#+------------------+-----------+
#| Wdr63_92626      | 152775366 |
#| Gm4694_99936     | 102751437 |
#| Gp6_92307        |  92099696 |
#| H2-Gs10_68977    |  86354720 |
#| Nkain2_28864     |  67101356 |
#| Nkain2_28863     |  67101356 |
#| Galnt2_VG19744   |   4434884 |
#| Gm7971_99937     |   2323470 |
#| Ptprd_VG12763    |   2270723 |
#| Cntnap2_VG19736  |   2241309 |
#| Sp110_76800      |   2223096 |
#| Gm16522_101344   |   1809912 |
#| Gm10424_99920    |   1807191 |
#| Ctnna3_VG19733   |   1573570 |
#| Pcdh15_VG15967   |   1550393 |
#| Magi2_VG18895    |   1477753 |
#| Gpc5_VG15750     |   1432223 |
#| Naaladl2_VG19786 |   1339345 |
#| Agbl4_VG16439    |   1266664 |
#| Lingo2_VG18888   |   1244126 |
#| Prkg1_VG15918    |   1195585 |
#| Ptprt_VG10147    |   1139158 |
#| Nrg3_VG19738     |   1103949 |
#| Anks1b_VG16505   |   1099314 |
#| Erbb4_VG18672    |   1075874 |
#| Nrxn1_VG16178    |   1056424 |
#| Gm2023_101346    |   1040741 |
#| Ccl19_92178      |   1039536 |
#+------------------+-----------+

    runJoiner.csh mm9 ikmc
#mm9.ikmcExtra.name - hits 49000 of 49000 ok
#########################################################################
# LASTZ Turkey MelGal1 ( DONE - 2011-03-30 - Chin)
    mkdir /hive/data/genomes/mm9/bed/lastzMelGal1.2011-03-30
    cd /hive/data/genomes/mm9/bed/lastzMelGal1.2011-03-30

    cat << '_EOF_' > DEF
# Turkey vs Mouse

# TARGET: Mouse mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Turkey melGal1 - single chunk big enough to run entire chrom
SEQ2_DIR=/scratch/data/melGal1/melGal1.2bit
SEQ2_LEN=/scratch/data/melGal1/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/mm9/bed/lastzMelGal1.2011-03-30
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -syntenicNet \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1
    # real    71m8.450s
    cat fb.mm9.chainMelGal1Link.txt
    #   62597891 bases of 2620346127 (2.389%) in intersection
    cd /hive/data/genomes/mm9/bed
    ln -s lastzMelGal1.2011-03-30 lastz.melGal1

    #   running the swap 
    mkdir /hive/data/genomes/melGal1/bed/blastz.mm9.swap
    cd /hive/data/genomes/melGal1/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/mm9/bed/lastzMelGal1.2011-03-30/DEF \
        -swap \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1
    #   real    6m49.871s
    cat fb.melGal1.chainMm9Link.txt
    #   50649368 bases of 935922386 (5.412%) in intersection
    cd /hive/data/genomes/melGal1/bed
    ln -s blastz.mm9.swap lastz.mm9

############################################################################
# Nuclear Lamina (working 2011-04-04 - Chin)
# The track is based on:
# "Molecular maps of the reorganization of genome-nuclear lamina 
# interactions during differentiation"
# Peric-Hupkes D, Meuleman W, Pagie L, Bruggeman SW et al. 
# Mol Cell 2010 May 28;38(4):603-13. PMID: 20513434
# GEO Accession Series GSE17051
# Main Contact: 
# Bas van Steensel
# Division of Gene Regulation, room B4.042
# Netherlands Cancer Institute
# Plesmanlaan 121
# 1066 CX Amsterdam
# The Netherlands
# Phone +31.20.5122040
# http://research.nki.nl/vansteensellab

    # Download data
    mkdir /hive/data/outside/vansteensel
    cd /hive/data/outside/vansteensel
    # Raw GEO data
    wget --timestamping \
      ftp://ftp.ncbi.nih.gov/pub/geo/DATA/supplementary/series/GSE17051/GSE17051_RAW.tar
    # SOFT formatted family file(s)
     wget --timestamping \
       ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SOFT/by_series/GSE17051/GSE17051_family.soft.gz
    # MINiML formatted family file(s) 
    wget --timestamping \
      ftp://ftp.ncbi.nih.gov/pub/geo/DATA/MINiML/by_series/GSE17051/GSE17051_family.xml.tgz
    # Series Matrix File(s)
    wget --timestamping \
      ftp://ftp.ncbi.nih.gov/pub/geo/DATA/SeriesMatrix/GSE17051/GSE17051_series_matrix.txt.gz

    # wiggle data from vansteensel lab:
    # Embryonic Stem Cells
     wget --timestamping \
        http://bioinformatics.nki.nl/~meuleman/ES.gff.gz
     wget --timestamping \
        http://bioinformatics.nki.nl/~meuleman/ES.wig.gz
    # Neuronal Precursor Cells
     wget --timestamping \
        http://bioinformatics.nki.nl/~meuleman/NP.gff.gz
     wget --timestamping \
        http://bioinformatics.nki.nl/~meuleman/NP.wig.gz
    # Astrocytes
     wget --timestamping \
        http://bioinformatics.nki.nl/~meuleman/AC.gff.gz
     wget --timestamping \
        http://bioinformatics.nki.nl/~meuleman/AC.wig.gz
    # NIH3T3 Cells
     wget --timestamping \
        http://bioinformatics.nki.nl/~meuleman/EF.gff.gz
     wget --timestamping \
        http://bioinformatics.nki.nl/~meuleman/EF.wig.gz

    mkdir /hive/data/genomes/mm9/bed/nuclearLamina
    cd /hive/data/genomes/mm9/bed/nuclearLamina
    cp /hive/data/outside/vansteensel/*.gff.gz .
    cp /hive/data/outside/vansteensel/*.wig.gz .
    gunzip *.gz

    # The wiggle files contain 51 sets of duplicates (all on chr18).
    # Per the data provider's instructions, they were fixed by taking
    # the average.
    cat << '_EOF_' > quickCheckWig.pl
#!/usr/bin/env perl
use strict;
use warnings;

my $chrom;
my $curPos;
my $curValue;
my $prevPos;
my $prevValue;
my $lineNum = 0;
my $tf1;
my $tf2;

open(FH, $ARGV[0]) or die("Error: cannot open file '$ARGV[0]'\n");
while (my $line = <FH>) {
    $lineNum += 1;   
    if ($line =~ m/^browser/ ) {
        next;
    } elsif ($line =~ m/track/ ) {
        next;
    } elsif ($line =~ m/^variableStep/ ) {
      ($tf1, $chrom, $tf2) = split('\s+', $line, 3);
      $chrom =~ s/^chrom=//;
      $curPos=0;
      $curValue=0;      
      $prevPos=0;
      $prevValue=0;
      next;
    } elsif ($line =~ m/^[0-9]/ ) {
	$prevPos = $curPos;
	$prevValue = $curValue;
        ($curPos, $curValue)=split('\s+', $line, 2);
	$curValue =~ s/\n//;
	$prevValue = $curValue;
        if ($curPos == $prevPos)  {
          printf("Duplicate %s %s on %s at line # %s \n", $curPos, $curValue, $chrom, $lineNum);
	  next;   
	}   
    }
} 
   
close (FH);
'_EOF_'
    #<< happy emacs
    chmod +x quickCheckWig.pl
    # Scan the wiggle of each cell type; whatever the script writes to
    # stdout or stderr is collected in <cell type>.dup.list
    for WIG in AC EF ES NP; do
      ./quickCheckWig.pl "${WIG}.wig" > "${WIG}.dup.list" 2>&1
    done

    cat << '_EOF_' > fixDupWig.pl
#!/usr/bin/env perl
use strict;
use warnings;
my $chrom;
my $curPos;
my $curValue;
my $prevPos;
my $prevValue;
my $tf1;
my $tf2;
sub resetAll {
  $curPos=0;
  $curValue=0;
  $prevPos=0;
  $prevValue=0;
}
my $lineNum = 0;
my $prtPos=0;
my $prtValue=0;
resetAll();
open(FH, $ARGV[0]) or die("Error: cannot open file '$ARGV[0]'\n");
while (my $line = <FH>) {
    $lineNum += 1;   
    if ($line =~ m/^browser/ ) {
	printf("%s", $line);
        next;
    } elsif ($line =~ m/track/ ) {
        printf("%s", $line);
        next;
    } elsif ($line =~ m/^variableStep/ ) {
      #  get chrom number  
      ($tf1, $chrom, $tf2) = split('\s+', $line, 3);
      $chrom =~ s/^chrom=//;
      &resetAll();      
      printf  ("%s",$line);
      next;
    } elsif ($line =~ m/^[0-9]/ ) {
        chomp($line);
        ($curPos, $curValue)=split('\s+', $line, 2);
        if ($prevPos == 0) {
          $prevPos = $curPos;
          $prevValue = $curValue;
	  next;
	} elsif ($prevPos == $curPos) {
	  $prevValue=($prevValue+$curValue)/2;
	 next;
        } else {
	 printf("%s\t%s\n", $prevPos, $prevValue);
          $prevPos=$curPos;
          $prevValue=$curValue;
	}
   }	
} 
close (FH);
'_EOF_'
    #<< happy emacs
    chmod +x fixDupWig.pl

    # Write a <cell type>_Fixed.wig for every wiggle, with duplicated
    # positions replaced by their average
    for WIG in AC EF ES NP; do
      ./fixDupWig.pl "${WIG}.wig" > "${WIG}_Fixed.wig"
    done

    # tested by loading the *_Fixed.wig files as custom tracks


#########################################################################
# LASTZ Lizard AnoCar2 (DONE - 2011-04-25 - Hiram)
    # XXX FYI: the date on this directory is incorrect, it was done 04-25
    mkdir /hive/data/genomes/mm9/bed/lastzAnoCar2.2011-04-19
    cd /hive/data/genomes/mm9/bed/lastzAnoCar2.2011-04-19

    # DEF is the parameter file for this run: alignment settings, target
    # (SEQ1 = mm9) and query (SEQ2 = anoCar2) sequence locations, and the
    # working directory (BASE).  Its path is handed to doBlastzChainNet.pl
    # as the first argument below, and again for the swap.
    cat << '_EOF_' > DEF
# mouse vs lizard
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/scratch/data/blastz/HoxD55.q

# TARGET: Mouse mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Lizard anoCar2
SEQ2_DIR=/scratch/data/anoCar2/anoCar2.2bit
SEQ2_LEN=/scratch/data/anoCar2/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0
SEQ2_LIMIT=40

BASE=/hive/data/genomes/mm9/bed/lastzAnoCar2.2011-04-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    # runs in the background; stdout and stderr are collected in do.log
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-syntenicNet -workhorse=hgwdev -smallClusterHub=encodek \
	-bigClusterHub=swarm -qRepeats=windowmaskerSdust > do.log 2>&1 &
    #	real    289m10.549s
    # recorded result of the run (the line after the cat is its output):
    cat fb.mm9.chainAnoCar2Link.txt 
    #	88067954 bases of 2620346127 (3.361%) in intersection

    #	running the swap - DONE - 2011-04-19
    # NOTE(review): this swap date is earlier than the 04-25 completion date
    # given for the section - confirm which date is right.
    # The swap reuses the DEF written above, with -swap added, and is run
    # from a directory under the anoCar2 tree; its log goes to swap.log.
    mkdir /hive/data/genomes/anoCar2/bed/blastz.mm9.swap
    cd /hive/data/genomes/anoCar2/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm9/bed/lastzAnoCar2.2011-04-19/DEF \
	-noLoadChainSplit -chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-syntenicNet -swap -qRepeats=windowmaskerSdust > swap.log 2>&1 &
    #	real    14m28.747s
    cat fb.anoCar2.chainMm9Link.txt 
    #	84738440 bases of 1701353770 (4.981%) in intersection

##############################################################################
# BUILD mm9 GERP TRACK (DONE 4/25/11, Fan)

# Build the GERP track from one wiggle file: convert it to All_mm9_RS.bw,
# link that file under /gbdb, and record the /gbdb path in a one-row table.
ssh hgwdev
mkdir /hive/data/genomes/mm9/bed/gerp
cd /hive/data/genomes/mm9/bed/gerp

# place the wig data file, All_mm9_RS.wig, here.
# NOTE(review): where All_mm9_RS.wig was obtained is not recorded here.

# set the data-segment (-d) and virtual-memory (-v) limits of this shell
# before running the conversion
ulimit -d 180000000
ulimit -v 180000000

# arguments: input wiggle, mm9 chrom.sizes, output file
wigToBigWig All_mm9_RS.wig /hive/data/genomes/mm9/chrom.sizes All_mm9_RS.bw

ln -s `pwd`/All_mm9_RS.bw /gbdb/mm9/bbi/All_mm9_RS.bw

# (re)create table allMm9RS_BW with a single fileName column holding the
# /gbdb path of the link made above
hgsql mm9 -e 'drop table if exists allMm9RS_BW; \
              create table allMm9RS_BW (fileName varchar(255) not null); \
	                    insert into allMm9RS_BW values ("/gbdb/mm9/bbi/All_mm9_RS.bw");'

# create corresponding trackDb.ra section and html description page.


############################################################################
# Nuclear Lamina (DONE 2011-04-04 - Chin)
#
# "Molecular maps of the reorganization of genome-nuclear lamina 
# interactions during differentiation"
# Peric-Hupkes D, Meuleman W, Pagie L, Bruggeman SW et al. 
# Mol Cell 2010 May 28;38(4):603-13. PMID: 20513434
# GEO Accession Series GSE17051
# Main Contact: 
# Bas van Steensel
# Division of Gene Regulation, room B4.042
# Netherlands Cancer Institute
# Plesmanlaan 121
# 1066 CX Amsterdam
# The Netherlands
# Phone +31.20.5122040
# http://research.nki.nl/vansteensellab

    # Download data
    # mkdir -p throughout: the wgets use --timestamping so these steps can
    # be re-run, and the directories may already exist.
    mkdir -p /hive/data/outside/vansteensel
    cd /hive/data/outside/vansteensel
    # GEO series GSE17051, fetched in this order:
    #   raw GEO data (which we did not use at this time; get them anyway
    #   to keep data in sync for the future), SOFT formatted family file(s),
    #   MINiML formatted family file(s), Series Matrix File(s)
    GEO=ftp://ftp.ncbi.nih.gov/pub/geo/DATA
    for F in supplementary/series/GSE17051/GSE17051_RAW.tar \
             SOFT/by_series/GSE17051/GSE17051_family.soft.gz \
             MINiML/by_series/GSE17051/GSE17051_family.xml.tgz \
             SeriesMatrix/GSE17051/GSE17051_series_matrix.txt.gz
    do
      wget --timestamping "${GEO}/${F}"
    done

    # wiggle data from vansteensel lab (2011-04-26),
    # one .gff.gz and one .wig.gz each:
    #   ES = Embryonic Stem Cells    NP = Neuronal Precursor Cells
    #   AC = Astrocytes              EF = NIH3T3 Cells
    for CELL in ES NP AC EF
    do
      for EXT in gff wig
      do
        wget --timestamping \
          "http://bioinformatics.nki.nl/~meuleman/${CELL}.${EXT}.gz"
      done
    done

    ### new wiggles without duplicates from Wouter
    mkdir -p /hive/data/outside/vansteensel/2011-04-26
    cd /hive/data/outside/vansteensel/2011-04-26
    for WIG in AC EF ES NP
    do
      wget --timestamping \
        "http://bioinformatics.nki.nl/~meuleman/Chin/${WIG}.wig.gz"
    done

    # only the replacement wiggles are staged for the track build
    mkdir -p /hive/data/genomes/mm9/bed/nuclearLamina/rawdata
    cd /hive/data/genomes/mm9/bed/nuclearLamina/rawdata
    cp /hive/data/outside/vansteensel/2011-04-26/*.wig.gz .
    gunzip *.gz

    # The wiggle files contain 51 set of duplicates (all on chr18), 
    # Per data provider's instruction, fixed them by taking
    # the average.
    cat << '_EOF_' > quickCheckWig.pl
#!/usr/bin/env perl
use strict;
use warnings;

my $chrom;
my $curPos;
my $curValue;
my $prevPos;
my $prevValue;
my $lineNum = 0;
my $tf1;
my $tf2;

open(FH, $ARGV[0]) or die("Error: cannot open file '$ARGV[0]'\n");
while (my $line = <FH>) {
    $lineNum += 1;   
    if ($line =~ m/^browser/ ) {
        next;
    } elsif ($line =~ m/track/ ) {
        next;
    } elsif ($line =~ m/^variableStep/ ) {
      ($tf1, $chrom, $tf2) = split('\s+', $line, 3);
      $chrom =~ s/^chrom=//;
      $curPos=0;
      $curValue=0;      
      $prevPos=0;
      $prevValue=0;
      next;
    } elsif ($line =~ m/^[0-9]/ ) {
	$prevPos = $curPos;
	$prevValue = $curValue;
        ($curPos, $curValue)=split('\s+', $line, 2);
	$curValue =~ s/\n//;
	$prevValue = $curValue;
        if ($curPos == $prevPos)  {
          printf("Duplicate %s %s on %s at line # %s \n", $curPos, $curValue, $chrom, $lineNum);
	  next;   
	}   
    }
} 
   
close (FH);
'_EOF_'
    #<< happy emacs
    chmod +x quickCheckWig.pl
    # Scan the wiggle of each cell type; whatever the script writes to
    # stdout or stderr is collected in <cell type>.dup.list
    for WIG in AC EF ES NP; do
      ./quickCheckWig.pl "${WIG}.wig" > "${WIG}.dup.list" 2>&1
    done
    wc -l *.dup.list
    # every dup.list came out empty: no duplicates in these wiggles,
    # so the lists can go
    rm *.dup.list

    # load the tracks on hgwdev
    ssh hgwdev
    cd /cluster/data/mm9/bed/nuclearLamina/
    # step 1: encode each raw wiggle into a laminB1_<cell>.wig / .wib pair
    for WIG in AC EF ES NP; do
      wigEncode "./rawdata/${WIG}.wig" "laminB1_${WIG}.wig" \
          "laminB1_${WIG}.wib"
    done
    # Converted ./rawdata/AC.wig, upper limit 4.29, lower limit -5.74
    # Converted ./rawdata/EF.wig, upper limit 4.92, lower limit -5.44
    # Converted ./rawdata/ES.wig, upper limit 4.62, lower limit -5.57
    # Converted ./rawdata/NP.wig, upper limit 4.29, lower limit -6.00

    # step 2: load each encoded .wig into mm9 table laminB1_<cell>
    for WIG in AC EF ES NP; do
      hgLoadWiggle mm9 "laminB1_${WIG}" "laminB1_${WIG}.wig"
    done
    # Connected to database mm9 for track laminB1_AC
    # Creating wiggle table definition in mm9.laminB1_AC
    # Saving wiggle.tab
    # Loading mm9
    # ... ...
    rm wiggle.tab

    # step 3: link the .wib files into /gbdb/mm9/wib/
    for WIG in AC EF ES NP; do
      ln -s "/cluster/data/mm9/bed/nuclearLamina/laminB1_${WIG}.wib" \
         /gbdb/mm9/wib/
    done

    # Create the laminB1 supertrack definitions in mm9/trackDb.ra
    # copy the description html to the right place
    # NOTE(review): how /hive/data/outside/vansteensel/2011-05-11 was
    # populated is not recorded in this section.
    cp /hive/data/outside/vansteensel/2011-05-11/mouse_laminB1_DamID.html \
    /cluster/home/chinhli/kent/src/hg/makeDb/trackDb/mouse/mm9/laminB1Mm9.html
    cp /hive/data/outside/vansteensel/2011-05-11/MolCell2010_cartoon.png \
     /cluster/home/chinhli/kent/src/hg/htdocs/images/laminB1Mm9.png
    # edit/rescale the html and png files
    # add the new html and image files to git

    # collect wiggle stats for track definition:
    for WIG in AC EF ES NP
    do
      wigTableStats.sh mm9 laminB1_${WIG}
    done
    # recorded output, kept as comments so that pasting this section does
    # not try to run the result lines as commands:
    # # db.table      min max mean count sumData stdDev viewLimits
    # mm9.laminB1_AC  -5.742 4.293 -0.00201324 2102030 -4231.89 1.14068 viewLimits=-5.70541:4.293
    # # db.table      min max mean count sumData stdDev viewLimits
    # mm9.laminB1_EF  -5.444 4.922 -0.00154509 2102030 -3247.82 1.04232 viewLimits=-5.21314:4.922
    # # db.table      min max mean count sumData stdDev viewLimits
    # mm9.laminB1_ES  -5.572 4.617 -0.00133816 2102030 -2812.85 0.929746 viewLimits=-4.65007:4.617
    # # db.table      min max mean count sumData stdDev viewLimits
    # mm9.laminB1_NP  -5.998 4.292 -0.00148136 2102030 -3113.87 1.059 viewLimits=-5.29649:4.292

    # Create the laminB1 supertrack definitions in mm9/trackDb.ra
    # copy the description html to the right place
    # (same source and destination as the html copy above)
    cp /hive/data/outside/vansteensel/2011-05-11/mouse_laminB1_DamID.html \
    /cluster/home/chinhli/kent/src/hg/makeDb/trackDb/mouse/mm9/laminB1Mm9.html
    # scale down to 32%; this writes to the same laminB1Mm9.png path as
    # the plain copy of the png made earlier
    convert -size 32% \
        /hive/data/outside/vansteensel/2011-05-11/MolCell2010_cartoon.png \
    -resize 32% \
        /cluster/home/chinhli/kent/src/hg/htdocs/images/laminB1Mm9.png
    # edit/rescale the html and png files
    # add the new html and image files to git
    # cd ~/kent/src/hg/htdocs and make
    # cd ~/kent/src/hg/makeDb/trackDb and make DBS=mm9


#########################################################################
# LASTZ Cow BosTau6 (DONE - 2011-05-17 - Chin)
    mkdir /hive/data/genomes/mm9/bed/lastzBosTau6.2011-05-17
    cd /hive/data/genomes/mm9/bed/lastzBosTau6.2011-05-17

    # DEF is the parameter file for this run: alignment settings, target
    # (SEQ1 = mm9) and query (SEQ2 = bosTau6) sequence locations, and the
    # working directory (BASE).  Its path is handed to doBlastzChainNet.pl
    # as the first argument below, and again for the swap.
    cat << '_EOF_' > DEF
# mouse vs cow
# maximum M allowed with lastz is only 254
BLASTZ_M=254

# TARGET: Mouse mm9
SEQ1_DIR=/scratch/data/mm9/nib
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Cow bosTau6 
SEQ2_DIR=/scratch/data/bosTau6/bosTau6.2bit
SEQ2_LEN=/scratch/data/bosTau6/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/mm9/bed/lastzBosTau6.2011-05-17
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs
    # this run stays in the foreground (no trailing &); stdout and stderr
    # are collected in do.log
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        `pwd`/DEF \
        -syntenicNet \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1
    # real     211m26.412s 
    # recorded result of the run (the line after the cat is its output):
    cat fb.mm9.chainBosTau6Link.txt
    # 699351036 bases of 2620346127 (26.689%) in intersection
    # Create link
    # (a relative symlink lastz.bosTau6 -> the dated run directory)
    cd /hive/data/genomes/mm9/bed
    ln -s  lastzBosTau6.2011-05-17 lastz.bosTau6


    #   and the swap 
    # The swap reuses the DEF written above, with -swap added, and is run
    # from a directory under the bosTau6 tree; its log goes to swap.log.
    mkdir /hive/data/genomes/bosTau6/bed/blastz.mm9.swap
    cd /hive/data/genomes/bosTau6/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
        /hive/data/genomes/mm9/bed/lastzBosTau6.2011-05-17/DEF \
        -swap -syntenicNet  \
        -noLoadChainSplit \
        -workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
        -chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1
    #   real     53m5.237s
    cat fb.bosTau6.chainMm9Link.txt
    # 688894115 bases of 2649682029 (25.999%) in intersection
    # matching relative symlink on the bosTau6 side
    cd /hive/data/genomes/bosTau6/bed
    ln -s blastz.mm9.swap lastz.mm9


############################################################################
# NUMTS TRACK (DONE 2011-06-03 - Chin)

    # fetch the track text, its description page and the BAM plus index
    mkdir -p /hive/data/outside/Numts/mm9
    cd /hive/data/outside/Numts/mm9
    for F in mm9/all_mouse_tracks.txt mm9/MMS_NumtS.html \
             bam/MMS_NumtS.fasta.sorted.bam bam/MMS_NumtS.fasta.sorted.bam.bai
    do
      wget "http://193.204.182.50/files/${F}"
    done

    # working copies in the track build directory
    mkdir /cluster/data/mm9/bed/NumtS
    cd  /cluster/data/mm9/bed/NumtS
    cp /hive/data/outside/Numts/mm9/*.* .


    # split the all_mouse_tracks.txt into 3 bed files:
    # mmsNumtSAssembled.bed, mmsNumtS.bed, and mmsNumtSMitochondrion.bed
    # NOTE(review): the two commands below only list the "track name" and
    # "track type" header lines in tracks.list; the commands that wrote the
    # three bed files are not recorded here.

    awk '/^track name/' all_mouse_tracks.txt > tracks.list
    awk '/^track type/' all_mouse_tracks.txt >> tracks.list

    # load the 3 bed files to mm9: table numtS<suffix> from
    # mmsNumtS<suffix>.bed, the middle one having an empty suffix
    for SUFFIX in Assembled "" Mitochondrion
    do
      hgLoadBed mm9 "numtS${SUFFIX}" "mmsNumtS${SUFFIX}.bed"
    done
    # Make /gbdb/ links and load bam
    mkdir /gbdb/mm9/NumtS
    # brace expansion yields both the .bam and the .bam.bai
    ln -s "$(pwd)"/MMS_NumtS.fasta.sorted.bam{,.bai} /gbdb/mm9/NumtS/
    hgBbiDbLink mm9 bamMmsNumtSSorted \
        /gbdb/mm9/NumtS/MMS_NumtS.fasta.sorted.bam
    # setup trackDb for mm9

##############################################################################
# LASTZ X. tropicalis XenTro3 (DONE - 2011-09-20 - Hiram)
    mkdir /hive/data/genomes/mm9/bed/lastzXenTro3.2011-09-20
    cd /hive/data/genomes/mm9/bed/lastzXenTro3.2011-09-20

    # DEF is the parameter file for this run: alignment settings, target
    # (SEQ1 = mm9) and query (SEQ2 = xenTro3) sequence locations, and the
    # working directory (BASE).  Its path is handed to doBlastzChainNet.pl
    # as the first argument below, and again for the swap.
    # NOTE(review): the first comment line inside DEF says xenTro2, but
    # every SEQ2 setting and BASE name xenTro3; the label looks stale.
    cat << '_EOF_' > DEF
# Mouse (mm9) vs frog (xenTro2)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Mouse mm9
SEQ1_DIR=/scratch/data/mm9/mm9.2bit
SEQ1_LEN=/scratch/data/mm9/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Frog xenTro3
SEQ2_DIR=/scratch/data/xenTro3/xenTro3.2bit
SEQ2_LEN=/scratch/data/xenTro3/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/hive/data/genomes/mm9/bed/lastzXenTro3.2011-09-20
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    #	establish a screen to control this job
    screen
    # runs in the background; stdout and stderr are collected in do.log
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	> do.log 2>&1 &
    #	real    382m43.129s
    # recorded result of the run (the line after the cat is its output):
    cat fb.mm9.chainXenTro3Link.txt 
    #	81920795 bases of 2620346127 (3.126%) in intersection

    #	running the swap - DONE - 2011-09-21
    # The swap reuses the DEF written above, with -swap added, and is run
    # from a directory under the xenTro3 tree; its log goes to swap.log.
    mkdir /hive/data/genomes/xenTro3/bed/blastz.mm9.swap
    cd /hive/data/genomes/xenTro3/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm9/bed/lastzXenTro3.2011-09-20/DEF \
	-chainMinScore=5000 -chainLinearGap=loose \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-swap > swap.log 2>&1 &
    #	real    57m36.529s
    cat fb.xenTro3.chainMm9Link.txt 
    #	89770014 bases of 1358334882 (6.609%) in intersection

##############################################################################
