# for emacs: -*- mode: sh; -*-


# Drosophila Melanogaster -- 
# 
# Berkeley Drosophila Genome Project (fruitfly.org) release 3.1 (Jan. 2003)
# http://www.fruitfly.org/annot/release3.html
#
# FlyBase (http://flybase.bio.indiana.edu/) last updated 20 Jan 2003
#

# DOWNLOAD SEQUENCE (DONE 10/2/03 angie)
    ssh kksilo
    mkdir /cluster/store6/dm1
    cd /cluster/data
    ln -s /cluster/store6/dm1 dm1
    cd /cluster/data/dm1
    wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/FASTA/whole_genome_genomic_dmel_RELEASE3-1.FASTA
    faSplit byname whole_genome_genomic_dmel_RELEASE3-1.FASTA dummyArg

    # Follow FlyBase's lead on the chromosome names, but still use our 
    # "chr" prefix:
    foreach c (2L 2R 2h 3L 3R 3h 4 X Xh Yh U)
      mkdir $c
      sed -e 's/^>/>chr/' $c.fa > $c/chr$c.fa
      echo chr$c.fa size:
      faSize $c.fa
      echo $c/chr$c.fa size:
      faSize $c/chr$c.fa
      echo comparison:
      faCmp $c.fa $c/chr$c.fa
      echo ""
    end
    # Carefully review output of those commands, then:
    rm 2L.fa 2R.fa 2h.fa 3L.fa 3R.fa 3h.fa 4.fa X.fa Xh.fa Yh.fa U.fa

    # put away the big download file
    mkdir -p downloads/fruitfly
    mv whole_genome_genomic_dmel_RELEASE3-1.FASTA downloads/fruitfly/


# SPLIT CHROM FA INTO SMALLER CHUNKS BY GAPS (DONE 10/3/03 angie)
    ssh kksilo
    cd /cluster/data/dm1
    foreach c (?{,?})
      faSplit -minGapSize=100 -lift=$c/chr$c.lft \
        gap $c/chr$c.fa 2000000 $c/chr${c}_
    end
    foreach ctgFa (?{,?}/chr*_*.fa)
      set ctg = $ctgFa:r
      mkdir $ctg
      mv $ctgFa $ctg
    end


# CREATING DATABASE (DONE 10/3/03 angie)
    # Create the database.
    ssh hgwdev
    echo 'create database dm1' | hgsql ''
    # Make a semi-permanent read-only alias:
    alias dm1 "mysql -u hguser -phguserstuff -A dm1"
    # Make sure there is at least 5 gig free for the database
    df -h /var/lib/mysql


# EXTRACT GAP INFORMATION FROM FASTA, LOAD GAP TRACK (DONE 10/3/03 angie)
    ssh kksilo
    cd /cluster/data/dm1
    # size up the gap situation -- can we use gaps to extract agp info?
    faGapSizes downloads/fruitfly/whole_genome_genomic_dmel_RELEASE3-1.FASTA 
    # yup.  Jim's verdict:
    # I think that we can probably just show all gaps as bridged
    # in the non-h chromosomes, and as unbridged in the h chromosomes
    # and leave it at that.

    # Extract gaps using scaffoldFaToAgp.  It's really meant for a different 
    # purpose, so clean up its output: remove the .lft and .agp, and remove 
    # the last line of .gap (extra gap added at end).  Also substitute in 
    # the correct chrom name in .gap.  
    foreach c (?{,?})
      set chr = chr$c
      pushd $c
      scaffoldFaToAgp -minGapSize=100 $chr.fa
      rm $chr.{lft,agp}
      set chrSize = `faSize $chr.fa | awk '{print $1;}'`
      set origLines = `cat $chr.gap | wc -l`
      awk '($2 != '$chrSize'+1) {print;}' $chr.gap \
      | sed -e "s/chrUn/$chr/" > $chr.gap2
      set newLines = `cat $chr.gap2 | wc -l`
      if ($newLines == ($origLines - 1)) then
        mv $chr.gap2 $chr.gap
      else
        echo "Error: $chr/$chr.gap2 has wrong number of lines."
      endif
      popd
    end
    # Call the gaps unbridged in chrU and chr*h:
    foreach c (U ?h)
      set chr = chr$c
      sed -e 's/yes/no/' $c/$chr.gap > $c/$chr.gap2
      mv $c/$chr.gap2 $c/$chr.gap
    end
    ssh hgwdev
    hgLoadGap dm1 /cluster/data/dm1


# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE 10/13/03 angie/donnak)
    ssh hgwdev
    # Write ~/kent/src/hg/makeDb/trackDb/drosophila/dm1/description.html 
    # with a description of the assembly and some sample position queries.  
    chmod a+r ~/kent/src/hg/makeDb/trackDb/drosophila/dm1/description.html
    # Check it in and copy (perhaps via a make in trackDb??) to 
    # /cluster/data/dm1/html.  
    mkdir -p /gbdb/dm1/html
    ln -s /cluster/data/dm1/html/description.html /gbdb/dm1/html/


# RUN REPEAT MASKER (DONE 10/4/03 angie)
    # Note: drosophila library ("drosophila.lib") is dated May 27 '03.
    # Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make 
    # RepeatMasker runs manageable on the cluster ==> results need lifting.

    # Split contigs into 500kb chunks:
    ssh kksilo
    cd /cluster/data/dm1
    foreach d ( */chr*_?{,?} )
      cd $d
      set contig = $d:t
      faSplit -minGapSize=100 -lift=$contig.lft -maxN=500000 \
        gap $contig.fa 500000 ${contig}_
      cd ../..
    end

    # make the run directory, output directory, and job list
    mkdir RMRun
    cp /dev/null RMRun/RMJobs
    foreach d ( ?{,?}/chr*_?{,?} )
      set ctg = $d:t
      foreach f ( $d/${ctg}_?{,?}.fa )
        set f = $f:t
        echo /cluster/bin/scripts/RMDrosophila \
             /cluster/data/dm1/$d $f /cluster/data/dm1/$d \
           '{'check out line+ /cluster/data/dm1/$d/$f.out'}' \
        >> RMRun/RMJobs
      end
    end

    # do the run
    ssh kk
    cd /cluster/data/dm1/RMRun
    para create RMJobs
    para try
    para check
    para push
    para check,...
#Completed: 288 of 288 jobs
#CPU time in finished jobs:    1764726s   29412.10m   490.20h   20.43d  0.056 y
#IO & Wait Time:                  3392s      56.53m     0.94h    0.04d  0.000 y
#Average job time:                6139s     102.32m     1.71h    0.07d
#Longest job:                     7256s     120.93m     2.02h    0.08d
#Submission to last job:          7257s     120.95m     2.02h    0.08d

    # Lift up the split-contig .out's to contig-level .out's
    ssh kksilo
    cd /cluster/data/dm1
    foreach d ( ?{,?}/chr*_?{,?} )
      cd $d
      set contig = $d:t
      liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null
      cd ../..
    end

    # Lift up the contig-level .out's to chr-level
    foreach c (?{,?})
      cd $c
      if (-e chr$c.lft && ! -z chr$c.lft) then
        echo lifting $c
        /cluster/bin/i386/liftUp chr$c.fa.out chr$c.lft warn \
          `awk '{print $2"/"$2".fa.out";}' chr$c.lft` > /dev/null
      else
        echo Can\'t find $c/chr$c.lft \!
      endif
      cd ..
    end

    # soft-mask contig .fa's with .out's
    foreach c (?{,?})
      foreach j ($c/chr${c}_?{,?}/chr${c}_?{,?}.fa)
        maskOutFa $j $j.out $j -soft
      end
      echo done $c
    end

    # Load the .out files into the database with:
    ssh hgwdev
    hgLoadOut dm1 /cluster/data/dm1/?{,?}/*.fa.out


# SIMPLE REPEATS (TRF) (DONE 10/5/03 angie)
    # TRF runs pretty quickly now... it takes a few hours total runtime, 
    # so instead of binrsyncing and para-running, just do this on the
    # local fileserver
    ssh kksilo
    mkdir /cluster/data/dm1/bed/simpleRepeat
    cd /cluster/data/dm1/bed/simpleRepeat
    mkdir trf
    cp /dev/null jobs.csh
    foreach f (/cluster/data/dm1/?{,?}/chr*_*/chr?{,?}_?{,?}.fa)
        set fout = $f:t:r.bed
        echo $fout
        echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
        >> jobs.csh
    end
    tcsh jobs.csh >&! jobs.log &
    # check on this with
    tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    # When job is done do:
    mkdir /cluster/data/dm1/jkStuff
    liftUp simpleRepeat.bed /cluster/data/dm1/jkStuff/liftAll.lft warn \
      trf/*.bed

    # Load this into the database as so
    ssh hgwdev
    hgLoadBed dm1 simpleRepeat \
      /cluster/data/dm1/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql


# FILTER SIMPLE REPEATS (TRF) INTO MASK (DONE 10/5/03 angie)
    # make a filtered version # of the trf output: 
    # keep trf's with period <= 12:
    ssh kksilo
    cd /cluster/data/dm1/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/*.bed)
        echo "filtering $f"
        awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd /cluster/data/dm1
    mkdir bed/simpleRepeat/trfMaskChrom
    foreach c (?{,?})
      liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed $c/chr$c.lft warn \
        `awk '{print "bed/simpleRepeat/trfMask/"$2".bed";}' $c/chr$c.lft`
    end


# MASK FA USING REPEATMASKER AND FILTERED TRF FILES (DONE 10/5/03 angie)
    ssh kksilo
    cd /cluster/data/dm1
    foreach c (?{,?})
      echo repeat- and trf-masking chr$c.fa
      /cluster/home/kent/bin/i386/maskOutFa -soft \
        $c/chr$c.fa $c/chr$c.fa.out $c/chr$c.fa
      /cluster/home/kent/bin/i386/maskOutFa -softAdd \
        $c/chr$c.fa bed/simpleRepeat/trfMaskChrom/chr$c.bed $c/chr$c.fa
    end
    foreach c (?{,?})
      echo repeat- and trf-masking contigs of chr$c
      foreach ctgFa ($c/chr*/chr${c}_?{,?}.fa)
        set trfMask=bed/simpleRepeat/trfMask/$ctgFa:t:r.bed
        /cluster/home/kent/bin/i386/maskOutFa -soft $ctgFa $ctgFa.out $ctgFa
        /cluster/home/kent/bin/i386/maskOutFa -softAdd $ctgFa $trfMask $ctgFa
      end
    end


# STORE SEQUENCE AND ASSEMBLY INFORMATION (DONE 10/5/03 angie)

    # Translate to nib
    ssh kksilo
    cd /cluster/data/dm1
    mkdir nib
    foreach c (?{,?})
      faToNib -softMask $c/chr$c.fa nib/chr$c.nib
    end

    # Make symbolic links from /gbdb/dm1/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/dm1/nib
    foreach f (/cluster/data/dm1/nib/chr*.nib)
      ln -s $f /gbdb/dm1/nib
    end

    # Load /gbdb/dm1/nib paths into database and save size info.
    hgsql dm1  < ~/src/hg/lib/chromInfo.sql
    hgNibSeq -preMadeNib dm1 /gbdb/dm1/nib /cluster/data/dm1/?{,?}/chr?{,?}.fa
    echo "select chrom,size from chromInfo" | hgsql -N dm1 \
      > /cluster/data/dm1/chrom.sizes


# CREATING GRP TABLE FOR TRACK GROUPING (DONE 10/5/03 angie)
    # Copy all the data from the table "grp" 
    # in the existing database "rn1" to the new database
    ssh hgwdev
    echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \
      | hgsql dm1


# MAKE GCPERCENT (DONE 10/5/03 angie)
     ssh hgwdev
     mkdir /cluster/data/dm1/bed/gcPercent
     cd /cluster/data/dm1/bed/gcPercent
     # create and load gcPercent table
     hgsql dm1  < ~/src/hg/lib/gcPercent.sql
     hgGcPercent dm1 ../../nib


# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR DROSOPHILA (DONE 10/8/03 angie)
    # Warning: must genome and organism fields must correspond
    # with defaultDb values
    echo 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
                defaultPos, active, orderKey, genome, scientificName, \
                htmlPath, hgNearOk, hgPbOk, sourceName) values \
        ("dm1", "Jan. 2003", "/gbdb/dm1/nib", "Fruitfly", \
               "chr2L:827700-845800", 1, 55, "Fruitfly", \
                "Drosophila melanogaster", "/gbdb/dm1/html/description.html", \
                0, 0, "BDGP v. 3");' \
      | hgsql -h genome-testdb hgcentraltest
    echo 'INSERT INTO defaultDb (genome, name) values ("Fruitfly", "dm1");' \
      | hgsql -h genome-testdb hgcentraltest

    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/src/hg/makeDb/trackDb
    cvs up -d -P

    # Edit that makefile to add dm1 in all the right places and do
    make update

    # go public on genome-test
    #make alpha
    cvs commit makefile

    # Add trackDb directories
    mkdir drosophila
    mkdir drosophila/dm1
    cvs add drosophila
    cvs add drosophila/dm1
    cvs commit drosophila


# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR DROSOPHILA (DONE 10/17/03 angie)
    ssh hgwdev
    # Get appropriate hostname from cluster admins
    # 8/26/04: set canPcr=1 for untranslated blat server.
    echo 'insert into blatServers values("dm1", "blat10", 17787, 1, 0); \
          insert into blatServers values("dm1", "blat10", 17786, 0, 1);' \
      | hgsql -h genome-testdb hgcentraltest


# LOAD UP BDGP 3.1 ANNOTATIONS (DONE 10/8/03 angie)
    # fruitfly.org is the Berkeley Drosophila Genome Project.  
    # Their GFF annotations contain FlyBase IDs useful for cross-linking.
    ssh kksilo
    mkdir /cluster/data/dm1/bed/bdgpAnnotations
    cd /cluster/data/dm1/bed/bdgpAnnotations
    # Download all available annotations:
    wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_3_UTR_dmel_RELEASE3-1.GFF.gz
    wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_5_UTR_dmel_RELEASE3-1.GFF.gz
    wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_CDS_dmel_RELEASE3-1.GFF.gz
    wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_annotation_dmel_RELEASE3-1.GFF.gz
    wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_exon_dmel_RELEASE3-1.GFF.gz
    wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_intron_dmel_RELEASE3-1.GFF.gz
    wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_noncoding-gene_dmel_RELEASE3-1.GFF.gz
    wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_protein-coding-gene_dmel_RELEASE3-1.GFF.gz
    wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_splice_site_dmel_RELEASE3-1.GFF.gz
    wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_tRNA_dmel_RELEASE3-1.GFF.gz
    wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_transcript_dmel_RELEASE3-1.GFF.gz
    wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_translation_dmel_RELEASE3-1.GFF.gz
    wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/GFF/whole_genome_transposable_element_dmel_RELEASE3-1.GFF.gz
    gunzip *.gz
    # Protein-coding genes...
    perl -wpe 's/^/chr/; s/translation/CDS/; \
               s/genegrp.*transgrp=(\S+);.*$/$1/; \
               s/genegrp.*transgrp=(\S+)$/$1/;' \
      whole_genome_protein-coding-gene_dmel_RELEASE3-1.GFF \
    > bdgpGene.gff
    # Loading a .tab file caused some lines with super-long go fields 
    # to be skipped.  Generating a .sql file with INSERTs works.
    perl -we 'while (<>) { \
                chop; @fields = split("\t"); \
                if ($fields[2] eq "gene") { \
                  @vars = split("; ", $fields[8]); \
                  $go = "";  $cdna_clone = ""; \
                  foreach $v (@vars) { \
                    @vv = split("=", $v); \
                    if ($vv[0] eq "name") { \
                      $bdgpName = $vv[1]; \
                    } elsif ($vv[0] eq "dbxref") { \
                      if ($vv[1] =~ /^GO:(\d+)/) { \
                        $go .= "$1,"; \
                      } elsif ($vv[1] =~ /FlyBase:(\w+)/) { \
                        $flybase = $1; \
                      } else { \
                        die "unrecognized dbxref $vv[1]"; \
                      } \
                    } elsif ($vv[0] eq "symbol") { \
                      $symbol = $vv[1]; \
                    } elsif ($vv[0] eq "cytorange") { \
                      $cytorange = $vv[1]; \
                    } elsif ($vv[0] eq "cdna_clone") { \
                      $cdna_clone .= "$vv[1],"; \
                    } elsif ($vv[0] eq "genegrp") { \
                    } else { \
                      die "unrecognized var $v" \
                    } \
                  } \
                  print "INSERT INTO bdgpGeneInfo VALUES ( \"$bdgpName\", \"$flybase\", \"$go\", \"$symbol\", \"$cytorange\", \"$cdna_clone\");\n"; \
                } \
              }' \
      whole_genome_protein-coding-gene_dmel_RELEASE3-1.GFF \
    > bdgpGeneInfo.sql
    # Proteins for coding genes:
    wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/FASTA/whole_genome_translation_dmel_RELEASE3-1.FASTA.gz
    gunzip -c whole_genome_translation_dmel_RELEASE3-1.FASTA.gz \
    | perl -wpe 's/^>(pp-)*(\w+)-\w(\w).*/>$2-R$3/' \
    > bdgpGenePep.fa
    # load up coding genes, proteins and assoc. info:
    ssh hgwdev
    ldHgGene dm1 bdgpGene /cluster/data/dm1/bed/bdgpAnnotations/bdgpGene.gff
    hgPepPred dm1 generic bdgpGenePep \
      /cluster/data/dm1/bed/bdgpAnnotations/bdgpGenePep.fa
    hgsql dm1 < $HOME/src/hg/lib/bdgpGeneInfo.sql
    hgsql dm1 < /cluster/data/dm1/bed/bdgpAnnotations/bdgpGeneInfo.sql
    # Non-coding genes...
    perl -wpe 's/^/chr/; \
               s/genegrp.*transgrp=(\S+);.*$/$1/; \
               s/genegrp.*transgrp=(\S+)$/$1/;' \
      whole_genome_noncoding-gene_dmel_RELEASE3-1.GFF \
    > bdgpNonCoding.gff
    sed -e 's/bdgpGeneInfo/bdgpNonCodingInfo/' \
      ~/kent/src/hg/lib/bdgpGeneInfo.sql \
      > bdgpNonCodingInfo.sql
    perl -we 'while (<>) { \
                chop; @fields = split("\t"); \
                if (($fields[2] ne "exon") && \
                    ($fields[2] ne "transcript") && \
                    ($fields[2] ne "translation")) { \
                  @vars = split("; ", $fields[8]); \
                  $go = "";  $cdna_clone = ""; \
                  foreach $v (@vars) { \
                    @vv = split("=", $v); \
                    if ($vv[0] eq "name") { \
                      $bdgpName = $vv[1]; \
                    } elsif ($vv[0] eq "dbxref") { \
                      if ($vv[1] =~ /^GO:(\d+)/) { \
                        $go .= "$1,"; \
                      } elsif ($vv[1] =~ /FlyBase:(\w+)/) { \
                        $flybase = $1; \
                      } else { \
                        die "unrecognized dbxref $vv[1]"; \
                      } \
                    } elsif ($vv[0] eq "symbol") { \
                      $symbol = $vv[1]; \
                    } elsif ($vv[0] eq "cytorange") { \
                      $cytorange = $vv[1]; \
                    } elsif ($vv[0] eq "cdna_clone") { \
                      $cdna_clone .= "$vv[1],"; \
                    } elsif ($vv[0] eq "genegrp") { \
                    } else { \
                      die "unrecognized var $v" \
                    } \
                  } \
                  print "INSERT INTO bdgpNonCodingInfo VALUES ( \"$bdgpName\", \"$flybase\", \"$go\", \"$symbol\", \"$cytorange\", \"$cdna_clone\");\n"; \
                } \
              }' \
      whole_genome_noncoding-gene_dmel_RELEASE3-1.GFF \
    >> bdgpNonCodingInfo.sql
    ssh hgwdev
    ldHgGene dm1 bdgpNonCoding \
      /cluster/data/dm1/bed/bdgpAnnotations/bdgpNonCoding.gff
    hgsql dm1 < /cluster/data/dm1/bed/bdgpAnnotations/bdgpNonCodingInfo.sql


# FLYBASE GENES (REDONE 6/28/04 angie)
    ssh hgwdev
    mkdir /cluster/data/dm1/bed/flyBase
    cd /cluster/data/dm1/bed/flyBase
    wget -O genes.txt.040617 \
      ftp://flybase.bio.indiana.edu/flybase/genes/genes.txt
    # Had to edit genes.txt.040617 to get around these two lines that didn't 
    # start with acode symbols:
    #line 258894
#Allele class: hypom
    #line 459025 (459026 of original)
#uncertain *u FBan0017679; annotated data are available for this gene.
    #line 3128725 (3128726 of original), for Trn-SR / FBgn0031456:
#%f nu
    hgFlyBase dm1 genes.txt.040617


# BDGP GENE DISRUPTION PROJECT/PSCREEN (DONE 7/29/04 angie)
    ssh hgwdev
    mkdir /cluster/data/dm1/bed/pscreen
    cd /cluster/data/dm1/bed/pscreen
    # Robin Hiesinger emailed 2 table dumps that need to be joined on ID;
    # saved as atbloom.dump, genetag_gdp.dump.
    # Turns out that this file has a more complete mapping than atbloom --
    # use it to fill in stock numbers missing from atbloom:
    wget http://flybase.bio.indiana.edu/stocks/stock-centers/bloomington/lk/bloomington.csv
    dos2unix atbloom.dump
    # one-shot Perl script:
    chmod a+x mkPscreen.pl
    mkPscreen.pl genetag_gdp.dump atbloom.dump bloomington.csv > pscreen.bed
    hgLoadBed dm1 pscreen -sqlTable=$HOME/kent/src/hg/lib/pscreen.sql -tab \
      pscreen.bed
    # 8/13/04: Galt found some duplicate IDs and Robin/Bob(?) suggested 
    # fixing them as follows:
    hgsql dm1 -e '\
      delete from pscreen where name = "KG05017" limit 1; \
      delete from pscreen where name = "EY14401" or name = "PA07741"; \
      delete from pscreen where name = "EY14474" and chromEnd = "8975785"; \
      delete from pscreen where name = "KV00692" and chromEnd = "4702749";'
    # 8/26/04: Drop geneDeltas column due to incomplete / possibly stale 
    # data.  (removed it from pscreen.as & derivatives, hgc.c first)
    # Also remove 3 items mapped off the end of chrX.
    hgsql dm1 -e 'alter table pscreen drop column geneDeltas;'
    hgsql dm1 -e '\
      delete from pscreen where name = "KG03740"; \
      delete from pscreen where name = "BG01196"; \
      delete from pscreen where name = "EY02003";'
    # 9/16/04: Bob Levis emailed an update... a bunch of deletions, 
    # 1 change and 2 additions:
    # (see email text and UCSC_changes_091604v2.csv)
    hgsql dm1 -e '\
      delete from pscreen where name = "BG02404"; \
      delete from pscreen where name = "EY03152"; \
      delete from pscreen where name = "EY05239"; \
      delete from pscreen where name = "EY05534"; \
      delete from pscreen where name = "EY07100"; \
      delete from pscreen where name = "EY09238"; \
      delete from pscreen where name = "EY11132"; \
      delete from pscreen where name = "EY11235"; \
      delete from pscreen where name = "EY11805"; \
      delete from pscreen where name = "EY12014"; \
      delete from pscreen where name = "EY12019"; \
      delete from pscreen where name = "EY12603"; \
      delete from pscreen where name = "EY13113"; \
      delete from pscreen where name = "EY13711"; \
      delete from pscreen where name = "EY14401"; \
      delete from pscreen where name = "KG03002"; \
      delete from pscreen where name = "KG04889";'
    hgsql dm1 -e 'update pscreen set geneIds = "CG33472," where name = "EY04063";'
    hgsql dm1 -e 'update pscreen set chrom = "chr2L" where name = "EY11972";'
    hgsql dm1 -e 'insert into pscreen values(672, "chrX", 11430596, 11430597, "EY12008", 0, "+", 0, 1, "CG1830,");'
    hgsql dm1 -e 'insert into pscreen values(720, "chr2R", 17764291, 17764292, "EY12526", 0, "-", 0, 1, "CG30092,");'

#QA NOTE: [ASZ, 10-1-2008] mytouch dm1 pscreen 200501071700.00
#(because it was not updated after a bdgpGene update (pscreen is outdated)


# FLYBASE IN SITU IMAGES / EXPRESSION (REDONE 6/17/04 angie)
    # FlyBase has downloadable in situ images for BACs:
    # ftp://flybase.net/flybase/images/bac_insitu_pic/*
    # and FBti's:
    # ftp://flybase.net/flybase/images/in-situ-images/insitus.zip
    # but what Jim is interested in is the insitus for expression data.
    # Don't see that in the ftp listing, but they do make it easy to link in.
    ssh hgwdev
    cd /cluster/data/dm1/bed/flyBase
    wget -O summary.txt.040617 \
    'http://www.fruitfly.org/cgi-bin/ex/bquery.pl?qpage=entry&qtype=summarytext'
    hgsql dm1 < ~/kent/src/hg/lib/bdgpExprLink.sql
    hgsql dm1 -e \
      'load data local infile "summary.txt.040617" into table bdgpExprLink'


# SWISSPROT-FLYBASE CROSS-REFERENCING  (REDONE 6/17/04 angie)
    ssh hgwdev
    mkdir /cluster/data/dm1/bed/flyBaseSwissProt
    cd /cluster/data/dm1/bed/flyBaseSwissProt
    echo "select extAcc1,acc from extDbRef,extDb where extDbRef.extDb = extDb.id and extDb.val = 'flybase'" \
    | hgsql -N sp040515 \
    > fbSpAcc.tab
    ssh kksilo
    cd /cluster/data/dm1/bed/flyBaseSwissProt
    wget ftp://ftp.ebi.ac.uk/pub/databases/SPproteomes/fasta_files/proteomes/7227.FASTAC
    # Some of those SwissProt "names" are > 255 chars!  trim the few long ones.
    perl -we 'open(F, "fbSpAcc.tab") || die; \
              %sp2fb = (); \
              while (<F>) { \
                chop;  @words = split("\t"); \
                $sp2fb{$words[1]} = $words[0]; \
              } \
              close(F); \
              while (<>) { \
                if (/^>(\w+)\s+\((\w+)\)\s+(.*)/) { \
                  $fbAcc = $sp2fb{$2}; \
                  $spAcc = $3; \
                  $spAcc = substr($3, 0, 250) . "..." if (length($3) > 255); \
                  print "$fbAcc\t$2\t$spAcc\t$1\n" if (defined $fbAcc); \
                } \
              }' \
      7227.FASTAC \
    > flyBaseSwissProt.tab
    rm 7227.FASTAC
    ssh hgwdev
    hgsql dm1 < $HOME/src/hg/lib/flyBaseSwissProt.sql
    echo 'load data local infile "/cluster/data/dm1/bed/flyBaseSwissProt/flyBaseSwissProt.tab" into table flyBaseSwissProt' \
    | hgsql dm1


# AUTO UPDATE GENBANK MRNA RUN  (DONE 10/9/03 angie)

    # Put the nib's on /cluster/bluearc:
    ssh kksilo
    mkdir /cluster/bluearc/drosophila
    mkdir /cluster/bluearc/drosophila/dm1
    cp -pR /cluster/data/dm1/nib /cluster/bluearc/drosophila/dm1

    # Instructions for setting up incremental genbank updates are here:
    # http://www.soe.ucsc.edu/~markd/genbank-update/doc/initial-load.html
    # This time around, Markd handled adding the new species to gbGenome.c 
    # because it's not yet in the kent tree.  

    # Edit /cluster/data/genbank/etc/genbank.conf and add:
# dm1
dm1.genome = /cluster/bluearc/drosophila/dm1/nib/chr*.nib
dm1.lift = /cluster/data/dm1/jkStuff/liftAll.lft
dm1.genbank.mrna.xeno.load = yes
dm1.genbank.est.xeno.load = no
dm1.downloadDir = dm1

    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, mRNA only:
    nice bin/gbAlignStep -iserver=no -clusterRootDir=/cluster/bluearc/genbank \
      -srcDb=genbank -type=mrna -verbose=1 -initial dm1

    # Load the results from the above
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad dm1

    ssh eieio
    # To get this next one started, the work directory of the initial run 
    # needs to be moved out of the way.  
    rm -r /cluster/bluearc/genbank/work/initial.dm1
    # Now align refseqs:
    cd /cluster/data/genbank
    nice bin/gbAlignStep -iserver=no -clusterRootDir=/cluster/bluearc/genbank \
      -srcDb=refseq -type=mrna -verbose=1 -initial dm1
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 dm1

    ssh eieio
    # To get this next one started, the work directory of the initial run 
    # needs to be moved out of the way.  
    rm -r /cluster/bluearc/genbank/work/initial.dm1
    # Now align ESTs:
    nice bin/gbAlignStep -iserver=no -clusterRootDir=/cluster/bluearc/genbank \
      -srcDb=genbank -type=est -verbose=1 -initial dm1
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 dm1
    # Clean up:
    rm -r /cluster/bluearc/genbank/work/initial.dm1


# PUT NIBS ON ISCRATCH (DONE 10/9/03 angie)
    ssh kkr1u00
    cd /iscratch/i/dm1
    cp -pR /cluster/data/dm1/nib .
    iSync


# PRODUCING FUGU FISH ALIGNMENTS  (DONE 10/9/03 angie)
    # Assumes masked NIBs have been prepared as above
    # and Fugu pieces are already on kluster /iscratch/i.
    # next machine
    ssh kk
    mkdir -p /cluster/data/dm1/bed/blatFugu
    cd /cluster/data/dm1/bed/blatFugu
    mkdir psl
    ls -1S /iscratch/i/fugu/*.fa > fugu.lst
    ls -1S /iscratch/i/dm1/nib/chr*.nib > fly.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -q=dnax -t=dnax -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 fly.lst fugu.lst gsub spec
    para create spec
    para try
    para check
    para push
    para check
#Completed: 1408 of 1408 jobs
#Average job time:                 217s       3.61m     0.06h    0.00d
#Longest job:                     1431s      23.85m     0.40h    0.02d
#Submission to last job:          1433s      23.88m     0.40h    0.02d

    # When cluster run is done, sort alignments
    # append _blatFugu to chrom for .psl file names.
    ssh kksilo
    cd /cluster/data/dm1/bed/blatFugu
    foreach c (2L 2R 2h 3L 3R 3h 4 X Xh Yh U)
      echo -n "chr${c} "
      pslCat psl/chr${c}_*.psl \
      | pslSortAcc nohead chrom temp stdin
      rm -f chrom/chr${c}_blatFugu.psl
      mv chrom/chr${c}.psl chrom/chr${c}_blatFugu.psl
    end

    # Load alignments
    ssh hgwdev
    cd /cluster/data/dm1/bed/blatFugu/chrom
    hgLoadPsl -noTNameIx dm1 chr*_blatFugu.psl

    # Make fugu /gbdb/ symlink and load Fugu sequence data.
    mkdir /gbdb/dm1/fuguSeq
    ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta /gbdb/dm1/fuguSeq
    # ! ! !  DO NOT RUN hgLoadSeq in /gbdb - it leaves .tab files
    cd /cluster/data/dm1/bed/blatFugu
    hgLoadSeq dm1 /gbdb/dm1/fuguSeq/fugu_v3_mask.fasta


# BLASTZ D.PSEUDOOBSCURA (DONE 8/3/04 angie)
    ssh kksilo
    mkdir /cluster/data/dm1/bed/blastz.dp2.2004-08-03
    cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03
    ln -s blastz.dp2.2004-08-03 /cluster/data/dm1/bed/blastz.dp2
    cat << '_EOF_' > DEF
# D.melanogaster vs. D.pseudoobscura
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm1/nib
# unused: SEQ1_RMSK=
SEQ1_SMSK=
SEQ1_FLAG=-drosophila
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - D. pseudoobscura
SEQ2_DIR=/iscratch/i/dp2/nib
# unused: SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=-drosophila
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/dm1/bed/blastz.dp2.2004-08-03

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

'_EOF_'
    # << this line keeps emacs coloring happy
    # run bash shell if you don't already:
    bash
    source DEF
    mkdir run
    ~angie/hummus/make-joblist $DEF > $BASE/run/j
    sh ./xdir.sh
    cd run
    sed -e 's#^#/cluster/bin/penn/#' j > j2
    wc -l j*
    head j2
    mv j2 j
    # cluster run
    ssh kk
    cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/run
    para create j
    para try, check, push, check, ....
#Completed: 15939 of 15939 jobs
#Average job time:                  13s       0.21m     0.00h    0.00d
#Longest job:                      294s       4.90m     0.08h    0.00d
#Submission to last job:          1118s      18.63m     0.31h    0.01d

    # back in the bash shell on kksilo...
    mkdir /cluster/data/dm1/bed/blastz.dp2.2004-08-03/run.1
    cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/run.1
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > j
    # small cluster run
    ssh kki
    cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/run.1
    para create j
    para try, check, push, check, ....
#Completed: 21 of 21 jobs
#Average job time:                  18s       0.30m     0.01h    0.00d
#Longest job:                       25s       0.42m     0.01h    0.00d
#Submission to last job:            55s       0.92m     0.02h    0.00d
    cd ..
    rm -r raw

    # Translate .lav to axt, with dp2 in scaffold coords for collaborators:
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03
    mkdir axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin /cluster/data/dm1/nib /cluster/data/dp2/nib stdout \
        | axtSort stdin ../../$out
      popd
    end


# CHAIN PSEUDOOBSCURA BLASTZ (DONE 8/3/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
axtChain -verbose=0 $1 \
  /iscratch/i/dm1/nib \
  /iscratch/i/dp2/nib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 11 of 11 jobs
#Average job time:                   8s       0.13m     0.00h    0.00d
#Longest job:                       14s       0.23m     0.00h    0.00d
#Submission to last job:            14s       0.23m     0.00h    0.00d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain dm1 ${c}_chainDp2 $i
    end


# NET PSEUDOOBSCURA BLASTZ (DONE 8/3/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain
    netClass -noAr noClass.net dm1 dp2 pseudoobscura.net

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain
    rm noClass.net
    netFilter -syn pseudoobscura.net > pseudoobscuraSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain
    netFilter -minGap=10 pseudoobscura.net |  hgLoadNet dm1 netDp2 stdin
    netFilter -minGap=10 pseudoobscuraSyn.net \
    | hgLoadNet dm1 netSyntenyDp2 stdin


# MAKE VSDP2 DOWNLOADABLES (DONE 8/17/04 angie)
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsDp2
    cd /usr/local/apache/htdocs/goldenPath/dm1/vsDp2
    gzip -c \
      /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain/all.chain \
      > pseudoobscura.chain.gz
    gzip -c \
      /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain/pseudoobscura.net \
      > pseudoobscura.net.gz
    mkdir axtNet
    foreach f (/cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtNet/chr*axt)
      gzip -c $f > axtNet/$f:t.gz
    end
    md5sum *.gz */*.gz > md5sum.txt
    # Make a README.txt which explains the files & formats.


PRODUCING GENSCAN PREDICTIONS (TODO 11/14/03 angie)
    # Run on small cluster -- genscan needs big mem.
    ssh kkr1u00
    mkdir /cluster/data/dm1/bed/genscan
    cd /cluster/data/dm1/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Make hard-masked contigs
    foreach f (/cluster/data/dm1/?{,?}/chr*/chr?{,?}_?{,?}.fa)
      maskOutFa $f hard $f.masked
    end
    # Generate a list file, contigs.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    rm -f contigs.list
    touch contigs.list
    foreach f ( `ls -1S /cluster/data/dm1/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> contigs.list
    end
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 contigs.list single gsub jobList
    para create jobList
    para try
    para check
    para push
#Completed: 79 of 79 jobs
#Average job time:                 136s       2.26m     0.04h    0.00d
#Longest job:                      229s       3.82m     0.06h    0.00d
#Submission to last job:          1718s      28.63m     0.48h    0.02d

    # If there are crashes, diagnose with "para problems".  
    # If a job crashes due to genscan running out of memory, re-run it 
    # manually with "-window=1200000" instead of "-window=2400000".
    # chr14_21, chr16_4
    
    # Convert these to chromosome level files as so:
    ssh kksilo
    cd /cluster/data/dm1/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/dm1/bed/genscan
    ldHgGene dm1 genscan genscan.gtf
    hgPepPred dm1 generic genscanPep genscan.pep
    hgLoadBed dm1 genscanSubopt genscanSubopt.bed


# EXTEND BDGPGENE AND CREATE BDGPNEAR FOR HGNEAR (REDONE 6/18/04 angie)
    ssh hgwdev
    cd /cluster/data/dm1/bed/bdgpAnnotations.3.2
    cp ~/kent/src/hg/lib/bdgpSwissProt.sql .
    perl -we '%bName2all = (); \
              %bName2fb = (); \
              open(P, "echo \"select * from bdgpGeneInfo\" | hgsql -N dm1|") \
                || die; \
              while (<P>) { \
                chop; my @words = split("\t"); \
                $bName2fb{$words[0]} = \@words; \
              } \
              close(P); \
              open(P, "echo \"select bdgpGeneInfo.*,flyBaseSwissProt.* from bdgpGeneInfo,flyBaseSwissProt where bdgpGeneInfo.flyBaseId = flyBaseSwissProt.flyBaseId\" | hgsql -N dm1|") || die; \
              while (<P>) { \
                chop; my @words = split("\t"); \
                $bName2all{$words[0]} = \@words; \
              } \
              close(P); \
              open(P, "echo select name from bdgpGene | hgsql -N dm1|") ||die; \
              while (<P>) { \
                chop; $name = $_; \
                $bName = $name;  $bName =~ s/-R.*//; \
                if (exists($bName2all{$bName})) { \
                  ($bName, $fbID, $go, $symbol, $cyto, undef, \
                   undef, $spID, $spDesc, $spSymb) = @{$bName2all{$bName}}; \
                  print "INSERT INTO bdgpSwissProt VALUES ( \"$name\", \"$fbID\", \"$go\", \"$symbol\", \"$cyto\", \"$spID\", \"$spDesc\", \"$spSymb\");\n"; \
                } elsif (exists($bName2fb{$bName})) { \
                  ($bName, $fbID, $go, $symbol, $cyto, undef) = @{$bName2fb{$bName}}; \
                  $go = "" if (not defined $go); \
                  $cyto = "" if (not defined $cyto); \
                  print "INSERT INTO bdgpSwissProt VALUES ( \"$name\", \"$fbID\", \"$go\", \"$symbol\", \"$cyto\", \"n/a\", \"n/a\", \"n/a\");\n"; \
                } else { die "No info for $name."; } \
              } \
              close(P); ' \
    >> bdgpSwissProt.sql
    hgsql dm1 < bdgpSwissProt.sql

    # Re-ran the following steps 1/7/05 after reloading bdgpGene 
    # without -genePredExt, see 1/6/05 note below...:
    # Use the above table to add a proteinID field to bdgpGene.  
    hgsql dm1 -e 'create table bdgpGene2 \
                  select bdgpGene.*, bdgpSwissProt.swissProtId as proteinID \
                  from bdgpGene, bdgpSwissProt \
                  where bdgpGene.name = bdgpSwissProt.bdgpName'

    # Now examine bdgpGene2 vs. bdgpGene manually, carefully.  
    # Do they have the same # rows?  
    hgsql dm1 -N -e 'select count(*) from bdgpGene'
    hgsql dm1 -N -e 'select count(*) from bdgpGene2'
    # Are most proteinID fields non-"n/a"? 
    hgsql dm1 -N -e 'select count(*) from bdgpGene2 where proteinID = "n/a"'
    # Spot-check some genes... are the fields of bdgpGene2 identical to 
    # the fields of bdgpGene, except for the new proteinIDs?  
    hgsql dm1 -N -e 'select * from bdgpGene limit 3'
    hgsql dm1 -N -e 'select * from bdgpGene2 limit 3'
    # If so, then go ahead:
    hgsql dm1 -e 'drop table bdgpGene; rename table bdgpGene2 to bdgpGene;'
    # and check the new bdgpGene manually.

# MYTOUCH FIX - Jen - 2006-01-24
  sudo mytouch dm1 arbExpDistance 0501081700.00
  sudo mytouch dm1 arbExpDistance 0501081700.00
  sudo mytouch dm1 arbExpDistance 0501081700.00
  sudo mytouch dm1 bdgpBlastTab 0501081700.00
  sudo mytouch dm1 bdgpBlastTab 0501081700.00
  sudo mytouch dm1 bdgpCanonical 0501081700.00
  sudo mytouch dm1 bdgpIsoforms 0501081700.00
  sudo mytouch dm1 bdgpGenePep 0501081700.00
  sudo mytouch dm1 bdgpSwissProt 0501081700.00
  sudo mytouch dm1 bdgpToCanonical 0501081700.00
  sudo mytouch dm1 bdgpToLocusLink 0501081700.00
  sudo mytouch dm1 bdgpToPfam 0501081700.00
  sudo mytouch dm1 bdgpToRefSeq 0501081700.00
  sudo mytouch dm1 ceBlastTab 0501081700.00
  sudo mytouch dm1 drBlastTab 0501081700.00
  sudo mytouch dm1 hgBlastTab 0501081700.00
  sudo mytouch dm1 mmBlastTab 0501081700.00
  sudo mytouch dm1 scBlastTab 0501081700.00
  sudo mytouch dm1 fbTranscript 0501081700.00
  sudo mytouch dm1 hgBlastTab 0501081700.00
  sudo mytouch dm1 blastFBPep00 0501081700.00


#############################################################################
# MAKE HGNEAR
# adapted from makeHgNear.doc; split into sections.  See makeHgFixed.doc 
# for how Arbeitman et al's fly lifecycle expression data were loaded into 
# hgFixed.  

# BLASTP SELF, CLUSTER GENES, MAP TO EXP.DATA FOR HGNEAR (REDONE 6/18/04 angie)
    ssh hgwdev
    # Now that bdgpGene has proteinID, use hgClusterGenes to cluster
    # together various alt-splicing isoforms, creating the tables
    # bdgpIsoforms and bdgpCanonical.  
    hgClusterGenes dm1 bdgpGene bdgpIsoforms bdgpCanonical
    # Extract peptides from bdgpGenes into fasta file
    # and create a blast database out of them.
    mkdir /cluster/data/dm1/bed/blastp
    cd /cluster/data/dm1/bed/blastp
    pepPredToFa dm1 bdgpGenePep bdgp.faa
    formatdb -i bdgp.faa -t bdgp -n bdgp

    # Copy over database to iscratch/i
    ssh kkr1u00
    if (-e /iscratch/i/dm1/blastp) then
      rm -r /iscratch/i/dm1/blastp
    endif
    mkdir -p /iscratch/i/dm1/blastp
    cp /cluster/data/dm1/bed/blastp/bdgp.* /iscratch/i/dm1/blastp

    # Load up iscratch/i with blastp and related files
    # if necessary
    if (! -e /iscratch/i/blast/blastall) then
      mkdir -p /iscratch/i/blast
      cp /projects/compbio/bin/i686/blastall /iscratch/i/blast
      mkdir -p /iscratch/i/blast/data
      cp /projects/compbio/bin/i686/data/* /iscratch/i/blast/data
    endif
    iSync

    # Split up fasta file into bite sized chunks for cluster
    ssh kksilo
    cd /cluster/data/dm1/bed/blastp
    mkdir split
    faSplit sequence bdgp.faa 6000 split/bg

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/dm1/bed/blastp/self/run/out
    cd /cluster/data/dm1/bed/blastp/self/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/dm1/blastp/bdgp -i \$1 -o \$2 -e 0.01 -m 8 -b 1000
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S ../../split/*.fa > split.lst
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
#Completed: 5821 of 5821 jobs
#Average job time:                  10s       0.16m     0.00h    0.00d
#Longest job:                      326s       5.43m     0.09h    0.00d
#Submission to last job:           655s      10.92m     0.18h    0.01d

    # Load into database.  This took only ~3 minutes for dm1.
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastp/self/run/out
    time hgLoadBlastTab dm1 bdgpBlastTab *.tab
    # Create table that maps between bdgp genes and RefSeq
    hgMapToGene dm1 refGene bdgpGene bdgpToRefSeq
    # Create table that maps between bdgp genes and LocusLink
    echo "select mrnaAcc,locusLinkId from refLink" | hgsql -N dm1 > refToLl.txt
### NOT DONE (LocusLink info temporarily missing from genbank)
    hgMapToGene dm1 refGene bdgpGene bdgpToLocusLink -lookup=refToLl.txt
###
    # Create table that maps between known genes and Pfam domains
    hgMapViaSwissProt dm1 bdgpGene name proteinID Pfam bdgpToPfam

    # Create a table that maps BDGP root names to canonical transcripts:
    cd /cluster/data/dm1/bed/blastp
    cat > bdgpToCanonical.sql <<end
CREATE TABLE bdgpToCanonical (
  name varchar(255) NOT NULL default '',
  value varchar(255) NOT NULL default '',
  KEY name (name(16)),
  KEY value (value(16))
) TYPE=MyISAM;
end
    perl -we 'open(P, "echo select transcript from bdgpCanonical | hgsql dm1 -N |") || die; \
              while (<P>) { \
                chop; $name = $value = $_; $value =~ s/-R.$//; \
                print "INSERT INTO bdgpToCanonical VALUES (\"$name\", \"$value\");\n"; \
              } \
              close(P);' \
      >> bdgpToCanonical.sql
    hgsql dm1 < bdgpToCanonical.sql
    # Create a table that maps between bdgp genes and the 
    # Stanford Microarray Project expression data. (see makeHgFixed.doc)
    hgExpDistance -lookup=bdgpToCanonical \
      dm1 hgFixed.arbFlyLifeMedianRatio dummyArg arbExpDistance

    # Make sure that GO database is up to date.
    See README in /cluster/store1/geneOntology.


# C.ELEGANS BLASTP FOR HGNEAR (REDONE 6/17/04 angie)
    # Make C. elegans ortholog column using blastp on wormpep.
    # First make C. elegans protein database and copy it to iscratch/i
    # if it doesn't exist already:
    ssh eieio
    mkdir /cluster/data/ce2/bed/blastp
    cd /cluster/data/ce2/bed/blastp
    # Point a web browser at ftp://ftp.sanger.ac.uk/pub/databases/wormpep/
    # to find out the latest version.  Then use that in place of 126 below.
    wget -O wormPep126.faa \
      ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep126/wormpep
    formatdb -i wormPep126.faa -t wormPep126 -n wormPep126
    ssh kkr1u00
    if (-e /iscratch/i/ce2/blastp) then
      rm -r /iscratch/i/ce2/blastp
    endif
    mkdir -p /iscratch/i/ce2/blastp
    cp /cluster/data/ce2/bed/blastp/wormPep126.p?? /iscratch/i/ce2/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/dm1/bed/blastp/ce2/run/out
    cd /cluster/data/dm1/bed/blastp/ce2/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/ce2/blastp/wormPep126 -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S ../../split/*.fa > split.lst
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
#Completed: 5821 of 5821 jobs
#Average job time:                   8s       0.14m     0.00h    0.00d
#Longest job:                      156s       2.60m     0.04h    0.00d
#Submission to last job:           156s       2.60m     0.04h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastp/ce2/run/out
    hgLoadBlastTab dm1 ceBlastTab -maxPer=1 *.tab


# MOUSE BLASTP FOR HGNEAR (REDONE 6/17/04 angie)
    # Make mouse ortholog column using blastp on mouse known genes.
    # First make mouse protein database and copy it to iscratch/i
    # if it doesn't exist already:
    ssh hgwdev
    mkdir /cluster/data/mm4/bed/blastp
    cd /cluster/data/mm4/bed/blastp
    pepPredToFa mm4 knownGenePep known.faa
    formatdb -i known.faa -t known -n known
    ssh kkr1u00
    if (-e /iscratch/i/mm4/blastp) then
      rm -r /iscratch/i/mm4/blastp
    endif
    mkdir -p /iscratch/i/mm4/blastp
    cp -p /cluster/data/mm4/bed/blastp/known.p?? /iscratch/i/mm4/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/dm1/bed/blastp/mm4/run/out
    cd /cluster/data/dm1/bed/blastp/mm4/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/mm4/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S ../../split/*.fa > split.lst
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
#Completed: 5821 of 5821 jobs
#Average job time:                  12s       0.20m     0.00h    0.00d
#Longest job:                      267s       4.45m     0.07h    0.00d
#Submission to last job:           267s       4.45m     0.07h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastp/mm4/run/out
    hgLoadBlastTab dm1 mmBlastTab -maxPer=1 *.tab


# HUMAN BLASTP FOR HGNEAR (REDONE 6/17/04 angie)
    # Make human ortholog column using blastp on human known genes.
    # First make human protein database and copy it to iscratch/i
    # if it doesn't exist already:
    mkdir /cluster/data/hg16/bed/blastp
    cd /cluster/data/hg16/bed/blastp
    pepPredToFa hg16 knownGenePep known.faa
    formatdb -i known.faa -t known -n known
    ssh kkr1u00
    if (-e /iscratch/i/hg16/blastp) then
      rm -r /iscratch/i/hg16/blastp
    endif
    mkdir -p /iscratch/i/hg16/blastp
    cp /cluster/data/hg16/bed/blastp/known.p?? /iscratch/i/hg16/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/dm1/bed/blastp/hg16/run/out
    cd /cluster/data/dm1/bed/blastp/hg16/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/hg16/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S ../../split/*.fa > split.lst
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
#Completed: 5821 of 5821 jobs
#Average job time:                  14s       0.24m     0.00h    0.00d
#Longest job:                      327s       5.45m     0.09h    0.00d
#Submission to last job:           327s       5.45m     0.09h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastp/hg16/run/out
    hgLoadBlastTab dm1 hgBlastTab -maxPer=1 *.tab


# ZEBRAFISH BLASTP FOR HGNEAR (REDONE 6/17/04 angie)
    # Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
    # First make protein database and copy it to iscratch/i
    # if it doesn't exist already:
    ssh kkstore
    mkdir /cluster/data/danRer1/bed/blastp
    cd /cluster/data/danRer1/bed/blastp
    wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH3.may.pep.fa.gz 
    zcat Dan*.pep.fa.gz > ensembl.faa
    formatdb -i ensembl.faa -t ensembl -n ensembl
    ssh kkr1u00
    if (-e /iscratch/i/danRer1/blastp) then
      rm -r /iscratch/i/danRer1/blastp
    endif
    mkdir -p /iscratch/i/danRer1/blastp
    cp /cluster/data/danRer1/bed/blastp/ensembl.p?? /iscratch/i/danRer1/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/dm1/bed/blastp/danRer1/run/out
    cd /cluster/data/dm1/bed/blastp/danRer1/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/danRer1/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S ../../split/*.fa > split.lst
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
#Completed: 5821 of 5821 jobs
#Average job time:                  12s       0.20m     0.00h    0.00d
#Longest job:                      273s       4.55m     0.08h    0.00d
#Submission to last job:           273s       4.55m     0.08h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastp/danRer1/run/out
    hgLoadBlastTab dm1 drBlastTab -maxPer=1 *.tab


# YEAST BLASTP FOR HGNEAR (REDONE 6/17/04 angie)
    # Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on 
    # RefSeq.  First make protein database and copy it to iscratch/i
    # if it doesn't exist already:
    mkdir /cluster/data/sacCer1/bed/blastp
    cd /cluster/data/sacCer1/bed/blastp
    wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz
    zcat orf_trans.fasta.gz > sgdPep.faa
    formatdb -i sgdPep.faa -t sgdPep -n sgdPep
    ssh kkr1u00
    # Note: sacCer1 is a name conflict with SARS coronavirus... oh well, 
    # fortunately we won't be looking for homologs there.  :)
    if (-e /iscratch/i/sacCer1/blastp) then
      rm -r /iscratch/i/sacCer1/blastp
    endif
    mkdir -p /iscratch/i/sacCer1/blastp
    cp /cluster/data/sacCer1/bed/blastp/sgdPep.p?? /iscratch/i/sacCer1/blastp
    iSync

    # Make parasol run directory 
    ssh kk
    mkdir -p /cluster/data/dm1/bed/blastp/sacCer1/run/out
    cd /cluster/data/dm1/bed/blastp/sacCer1/run
    # Make blast script
    cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /iscratch/i/blast/data
/iscratch/i/blast/blastall -p blastp -d /iscratch/i/sacCer1/blastp/sgdPep -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
    chmod a+x blastSome
    # Make gensub2 file
    cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end
    # Create parasol batch
    ls -1S ../../split/*.fa > split.lst
    gensub2 split.lst single gsub spec
    para create spec
    para try, check, push, check, ...
#Completed: 5821 of 5821 jobs
#Average job time:                   4s       0.07m     0.00h    0.00d
#Longest job:                       37s       0.62m     0.01h    0.00d
#Submission to last job:            42s       0.70m     0.01h    0.00d

    # Load into database.  
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastp/sacCer1/run/out
    hgLoadBlastTab dm1 scBlastTab -maxPer=1 *.tab


# MAKE ORGANISM-SPECIFIC HGNEARDATA FILES (DONE 10/17/03 angie)
    cd ~/kent/src/hg/near/hgNear/hgNearData
    # The directory name is the dbDb.genome field, processed by 
    # hdb.c's hgDirForOrg():
    mkdir D_melanogaster
    cp C_elegans/*.{html,ra} D_melanogaster/
    cd D_melanogaster
    mv kimLifeCycleFull.html arbLifeCycleFull.html
    mv kimLifeCycleMedian.html arbLifeCycleMedian.html
    # edit all .ra and .html files as appropriate for D. melanogaster/dm1...
    # cvs add and check in D_melanogaster/ and all .ra and .html files.
    # The "representatives" lines in columnDb.ra are tricky.  
    # For the median reps, I used this, edited to include an extra "-1," 
    # at each boundary between evelopmental stages (embryo -> larva, etc):
    # 29->30, 41->42, 59->60
    echo "select id from arbFlyLifeMedianExps;" | hgsql -N hgFixed \
    | perl -we '$i=0; \
                while (<>) { \
                  chop; print "$_,"; \
                  $i++; if (($i % 5) == 0) { print "-1," }; \
                } \
                print "\n";'
    # For the full reps, I used the output of this command minus the 
    # initial ",-1,".  -1 separators are inserted between each group 
    # that was lumped together in arbMed.ra:
    awk '-F      ' '{print $3;}' \
      ~/kent/src/hg/makeDb/hgMedianMicroarray/arbMed.ra \
    | perl -we 'while (<>) { \
                  @w=split(" "); \
                   print ",-1," . join(",", @w); \
                } \
                print "\n";'
    # Figure out what values to use for expMulti defn's absoluteMax.
    # Report on the max and 99.5%ile absolute score:
    echo select expScores from arbFlyLifeAll | hgsql hgFixed -N \
    | perl -we '$max = 0.0; \
                @all = (); \
                while (<>) { \
                  chop; @nums = split(","); \
                  foreach $num (@nums) { \
                    $max = $num if ($num > $max); \
                    push @all, $num if (defined $num && $num ne ""); \
                  } \
                } \
                $n = scalar(@all); \
                print "max is $max, N is $n\n"; \
                @all = sort { $a <=> $b } @all; \
                $useMax = $all[$n * 0.995 + 1]; \
                print "99.5%ile is $useMax -- use this for max\n";'
# max is 65535.000, N is 797364
# 99.5%ile is 15433.000 -- use this for max
    # N is the product of count(*) of arbFlyLifeAll{,Exps} .  
    # Now repeat the above perl in-liner, but on arbFlyLifeAllRatio, 
    # to determine ratioMax:  
    echo select expScores from arbFlyLifeAllRatio | hgsql hgFixed -N \
    # <paste in perl from above>
# max is 9.862, N is 797364
# 99.5%ile is 2.915 -- use this for max


# ENABLE HGNEAR FOR DM1 IN HGCENTRALTEST (DONE 10/17/03 angie)
    echo "update dbDb set hgNearOk = 1 where name = 'dm1';" \
      | hgsql -h genome-testdb hgcentraltest


# END OF HGNEAR STUFF
#############################################################################


# reload refseqs to pickup a change that uses locus_tag if gene name isn't
# available.  2003/10/16 markd

# First remove refseqs from databases:
    drop table refSeqStatus;
    drop table refLink;
    drop table refSeqSummary;
    drop table refGene;
    drop table refSeqAli;
    drop table refFlat;

    delete from gbSeq where srcDb = 'RefSeq';
    delete from gbStatus where srcDb = 'RefSeq';
    delete from gbExtFile where path like '%/refseq%';
    delete from gbLoaded where srcDb = 'RefSeq';
    delete from mrna where acc like 'NM\_%';
    delete from imageClone where acc like 'NM\_%';
    delete from mrna where acc like 'NR\_%';
    delete from imageClone where acc like 'NR\_%';

# now reload:
    cd /cluster/data/genbank
    ./bin/gbDbLoadStep -type=mrna dm1


# MAKE DOWNLOADABLE FILES (DONE 10/29/03 angie)
    ssh kksilo
    cd /cluster/data/dm1
    mkdir zips
    zip -j zips/chromOut.zip ?{,?}/chr?{,?}.fa.out
    zip -j zips/chromFa.zip ?{,?}/chr?{,?}.fa
    foreach f (?{,?}/chr?{,?}.fa)
      maskOutFa $f hard $f.masked
    end
    zip -j zips/chromFaMasked.zip ?{,?}/chr?{,?}.fa.masked
    cd bed/simpleRepeat
    zip ../../zips/chromTrf.zip trfMaskChrom/chr*.bed
    zip ../../zips/contigTrf.zip trfMask/N{T,G}*.bed
    cd ../..
    # Make a starter mrna.zip -- it will get updated regularly on the RR. 
    /cluster/data/genbank/bin/i386/gbGetSeqs -gbRoot=/cluster/data/genbank \
      -db=dm1 -native genbank mrna mrna.fa
    zip zips/mrna.zip mrna.fa
    rm mrna.fa
    foreach f (zips/*.zip)
      echo $f
      unzip -t $f | tail -1
    end
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm1
    cd /usr/local/apache/htdocs/goldenPath/dm1
    mkdir bigZips database
    # Create README.txt files in bigZips/ and database/ to explain the files.
    cp -p /cluster/data/dm1/zips/*.zip bigZips


# ANOPHELES ECORES FROM GENOSCOPE (DONE 11/10/03 angie)
    ssh hgwdev
    mkdir /cluster/data/dm1/bed/anophelesEcores
    cd /cluster/data/dm1/bed/anophelesEcores
    # save attachment from Olivier Jaillon's email 11/10/03 to 
    # ecotig.6.4ucsc
    perl -wpe 'if (/^(\w+)\:\d+ (\d+) (\d+) (\w+)\:\d+ (\d+) (\d+)$/) { \
                 if ($1 ne $4) { die "diff chr: $1 $4"; } \
                 $name = "chr$1:$2-$6";  $start = $2 - 1; \
                 $sz1 = $3 - $start;  $sz2 = $6 - ($5 - 1);   $st2 = $5 - $2; \
                 $_ = "chr$1\t$start\t$6\t$name\t0\t+\t$start\t$6\t0\t2\t$sz1,$sz2,\t0,$st2,\n"; \
               } elsif (/^(\w+)\:\d+ (\d+) (\d+)\s*$/) { \
                 $name = "chr$1:$2-$3";  $start = $2 - 1;  $sz1 = $3 - $start; \
                 $_ = "chr$1\t$start\t$3\t$name\t0\t+\t$start\t$3\t0\t1\t$sz1,\t0,\n"; \
               } else { chop; die "cant parse line $.:\n|$_|"; }' \
    < ecotig.6.4ucsc > anophelesEcores.bed
    hgLoadBed -tab dm1 anophelesEcores anophelesEcores.bed


# MASKED-QUERY XENO RNA ALIGNMENTS (DONE 10/22/03 angie)
    # Experimental track... if this TRF-masking works out, then find out 
    # what it would take to add it as an option to Mark's genbank stuff.
    ssh kksilo
    mkdir /cluster/data/dm1/bed/xenoMrnaMasked
    cd /cluster/data/dm1/bed/xenoMrnaMasked
    # Grab the latest full-release (137) genbank mRNA's.  Strip the 
    # .version suffixes so we can use accessions from mrna.gbidx to 
    # pick out the non-D.mel. mrnas.  
    perl -wpe 's/^>(\w+).\d+/>$1/' \
      /cluster/data/genbank/data/processed/genbank.137.0/full/mrna.fa \
      > allMrna.fa
    grep -v "Drosophila melanogaster" \
      /cluster/data/genbank/data/processed/genbank.137.0/full/mrna.gbidx \
        | awk '{print $1;}' | grep -v "^#" \
    > xenoAcc.lst
    faSomeRecords allMrna.fa xenoAcc.lst flyXenoRna.fa
    # Split up the sequences into manageably sized files.  
    mkdir flyXenoRnaSplit
    faSplit about flyXenoRna.fa 10000000 flyXenoRnaSplit/xenoRna
    # Now TRF-mask the sequences... use repeats with period <= 9 to mask.
    mkdir trf
    rm -f trf.log; touch trf.log
    foreach f (flyXenoRnaSplit/*.fa)
      set b = trf/$f:t:r.bed
      echo $f to $b...
      /cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null \
        -bedAt=$b -tempDir=/tmp -maxPeriod=9 >& trf.log
      maskOutFa -soft $f $b $f
    end
    # distribute masked xenoRna sequence on i-servers
    mkdir /cluster/bluearc/dm1/mrna.137
    cp -p /cluster/data/dm1/bed/xenoMrnaMasked/flyXenoRnaSplit/*.fa \
      /cluster/bluearc/dm1/mrna.137/

    ssh kk
    cd /cluster/data/dm1/bed/xenoMrnaMasked
    mkdir psl
    ls -1S /iscratch/i/dm1/nib/chr*.nib > fly.lst
    ls -1S /cluster/bluearc/dm1/mrna.137/xenoRna*.fa > mrna.lst
    echo '#LOOP \
/cluster/bin/i386/blat -maxIntron=50000 $(path1) {check in line+ $(path2)} -q=rnax -t=dnax -mask=lower {check out line+ psl/$(root1)_$(root2).psl} \
#ENDLOOP' > gsub
    gensub2 fly.lst mrna.lst gsub spec
    para create spec
    para try, check, push, check, ...
#Completed: 814 of 814 jobs
#Average job time:                 459s       7.65m     0.13h    0.01d
#Longest job:                     1756s      29.27m     0.49h    0.02d
#Submission to last job:          2145s      35.75m     0.60h    0.02d
    ssh kksilo
    cd /cluster/data/dm1/bed/xenoMrnaMasked
    pslSort dirs raw.psl /cluster/store2/temp psl
    pslReps raw.psl cooked.psl /dev/null -minAli=0.25
    # pslFilter -minMatch=60 -gapSizeLogMod=2 -minScore=30 cooked.psl filt.psl
    pslFilter -minAli=250 -minUniqueMatch=15 cooked.psl filt.psl
    pslSortAcc nohead chrom /cluster/store2/temp filt.psl
    pslCat -dir chrom > xenoMrnaMasked.psl
    rm -r chrom raw.psl cooked.psl filt.psl
    # Load into database as so:
    ssh hgwdev
    cd /cluster/data/dm1/bed/xenoMrnaMasked
    hgLoadPsl dm1 xenoMrnaMasked.psl
    # Looks like no need to hgLoadSeq -- seqs are already loaded by Mark's 
    # stuff.


# BLAT HONEYBEE (apiMel0) (DONE 1/13/04 angie)
    ssh kk
    mkdir /cluster/data/dm1/bed/blatApiMel0
    cd /cluster/data/dm1/bed/blatApiMel0
    mkdir psl
    ls -1S /iscratch/i/dm1/nib/*.nib > fly.lst
    ls -1S /iscratch/i/apiMel0/chunks/*.fa > bee.lst
cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/dm1/bed/blatApiMel0/psl/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'
    # << this line makes emacs coloring happy
    gensub2 fly.lst bee.lst gsub spec
    para create spec
    para try, check, push, check, ...
#Completed: 495 of 495 jobs
#Average job time:                 802s      13.37m     0.22h    0.01d
#Longest job:                    26590s     443.17m     7.39h    0.31d
#Submission to last job:         27028s     450.47m     7.51h    0.31d
    # postprocess
    pslCat -dir psl > blatApiMel0.psl
    # load
    ssh hgwdev
    cd /cluster/data/dm1/bed/blatApiMel0
    hgLoadPsl dm1 blatApiMel0.psl
    mkdir /gbdb/dm1/apiMel0
    foreach f (/cluster/data/apiMel0/groups/*.fa)
      ln -s $f /gbdb/dm1/apiMel0/$f:t
    end
    hgLoadSeq dm1 /gbdb/dm1/apiMel0/*.fa


#  miRNA track (DONE - 2004-05-04 - Hiram)
    #	data from: Sam Griffiths-Jones <sgj@sanger.ac.uk>
    #	and Michel.Weber@ibcg.biotoul.fr
    #	notify them if this assembly updates to renew this track
    ssh hgwdev
    mkdir /cluster/data/dm1/bed/miRNA
    cd /cluster/data/dm1/bed/miRNA
    wget --timestamping \
    "ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/dme_bdgp3.*"
    grep -v "^track " dme_bdgp3.bed | sed -e "s/ /\t/g" > dm1.bed
    hgLoadBed dm1 miRNA dm1.bed
    # entry in trackDb/trackDb.ra already there
    #	featureBits dm1 miRNA
    #	6845 bases of 126527731 (0.005%) in intersection


# BLASTZ D.YAKUBA (DONE 5/22/04 angie)
    ssh kksilo
    mkdir /cluster/data/dm1/bed/blastz.droYak1.2004-05-22
    cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22
    cat << '_EOF_' > DEF
# D.melanogaster vs. D.yakuba
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - D. melanogaster
SEQ1_DIR=/cluster/bluearc/drosophila/dm1/nib
# unused: SEQ1_RMSK=
SEQ1_SMSK=
SEQ1_FLAG=-drosophila
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - D. yakuba
SEQ2_DIR=/iscratch/i/droYak1/nib
# unused: SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=-drosophila
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/dm1/bed/blastz.droYak1.2004-05-22

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

#DEBUG=1
'_EOF_'
    # << this line keeps emacs coloring happy
    # run bash shell if you don't already:
    bash
    source DEF
    mkdir run
    ~angie/hummus/make-joblist $DEF > $BASE/run/j
    sh ./xdir.sh
    cd run
    sed -e 's#^#/cluster/home/angie/schwartzbin/#' j > j2
    wc -l j*
    head j2
    mv j2 j
    # cluster run
    ssh kk
    cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/run
    para create j
    para try, check, push, check, ....
#Completed: 672 of 672 jobs
#Average job time:                 164s       2.74m     0.05h    0.00d
#Longest job:                     1836s      30.60m     0.51h    0.02d
#Submission to last job:          2433s      40.55m     0.68h    0.03d

    # back on kksilo...
    mkdir /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/run.1
    cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/run.1
    ~angie/hummus/do.out2lav ../DEF > j
    # small cluster run
    ssh kki
    cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/run.1
    para create j
    para try, check, push, check, ....
#Completed: 21 of 21 jobs
#Average job time:                 453s       7.54m     0.13h    0.01d
#Longest job:                      650s      10.83m     0.18h    0.01d
#Submission to last job:           685s      11.42m     0.19h    0.01d
    cd ..
    rm -r raw

    # third run: lav -> axt
    ssh kki
    cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22
    mkdir axtChrom pslChrom run.2
    cd run.2
    cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| $HOME/bin/x86_64/lavToAxt stdin \
    /cluster/bluearc/drosophila/dm1/nib /iscratch/i/droYak1/nib stdout \
| $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt 
$HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
  ../../pslChrom/$chr.psl
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod a+x do.csh
    cp /dev/null jobList
    foreach d (../lav/chr*)
      echo "do.csh $d" >> jobList
    end
    para create jobList
    para try, check, push, check
#Completed: 11 of 11 jobs
#Average job time:                1516s      25.27m     0.42h    0.02d
#Longest job:                     2903s      48.38m     0.81h    0.03d
#Submission to last job:          2903s      48.38m     0.81h    0.03d

# CHAIN YAKUBA BLASTZ (DONE 5/27/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
axtFilter -notQ=chrUn_random $1 \
| axtChain -verbose=0 stdin \
  /cluster/bluearc/drosophila/dm1/nib \
  /iscratch/i/droYak1/nib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 11 of 11 jobs
#Average job time:                  18s       0.30m     0.01h    0.00d
#Longest job:                       35s       0.58m     0.01h    0.00d
#Submission to last job:            35s       0.58m     0.01h    0.00d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain dm1 ${c}_chainDroYak1 $i
    end


# NET YAKUBA BLASTZ (DONE 5/27/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChain
    netClass -noAr noClass.net dm1 droYak1 yakuba.net

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChain
    rm noClass.net
    netFilter -syn yakuba.net > yakubaSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChain
    netFilter -minGap=10 yakuba.net |  hgLoadNet dm1 netDroYak1 stdin
    netFilter -minGap=10 yakubaSyn.net | hgLoadNet dm1 netSyntenyDroYak1 stdin


# MAKE VSDROYAK1 DOWNLOADABLES (DONE 10/5/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChain
    gzip -c all.chain > ../../../zips/yakuba.chain.gz
    gzip -c yakuba.net > ../../../zips/yakuba.net.gz
    # axtNet/* already compressed...

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsDroYak1
    cd /usr/local/apache/htdocs/goldenPath/dm1/vsDroYak1
    mv /cluster/data/dm1/zips/yakuba*.gz .
    cp -pR /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtNet .
    md5sum *.gz */*.gz > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.


# GENERATE DROYAK1 MAF FOR MULTIZ FROM NET (DONE 5/27/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/axtChain
    netSplit yakuba.net net
    ssh kolossus
    cd /cluster/data/dm1/bed/blastz.droYak1.2004-05-22
    mkdir axtNet
    foreach f (axtChain/net/*)
      set chr = $f:t:r
      netToAxt $f axtChain/chain/$chr.chain /cluster/data/dm1/nib \
        /cluster/data/droYak1/nib stdout \
      | axtSort stdin axtNet/$chr.axt
    end

    mkdir mafNet
    foreach f (axtNet/chr*.axt)
      set maf = mafNet/$f:t:r.my.maf
      axtToMaf $f \
            /cluster/data/dm1/chrom.sizes /cluster/data/droYak1/chrom.sizes \
            $maf -tPrefix=dm1. -qPrefix=droYak1.
    end


# GENERATE DP2 MAF FOR MULTIZ FROM NET (DONE 8/9/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.dp2.2004-08-03/axtChain
    netSplit pseudoobscura.net net
    cd ..
    mkdir axtNet
    foreach f (axtChain/net/chr*.net)
      netToAxt $f axtChain/chain/$f:t:r.chain \
        /cluster/data/dm1/nib /cluster/data/dp2/nib stdout \
      | axtSort stdin axtNet/$f:t:r.axt
    end
    mkdir mafNet
    foreach f (axtNet/chr*.axt)
      set maf = mafNet/$f:t:r.maf
      axtToMaf $f \
            /cluster/data/dm1/chrom.sizes /cluster/data/dp2/chrom.sizes \
            $maf -tPrefix=dm1. -qPrefix=dp2.
    end


# MULTIZ MELANOGASTER/YAKUBA/PSEUDOOBSCURA (DONE 8/9/04 angie)
    # put the MAFs on bluearc
    ssh kksilo
    mkdir -p /cluster/bluearc/multiz.fly/my
    cp /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/mafNet/*.maf \
      /cluster/bluearc/multiz.fly/my
    mkdir -p /cluster/bluearc/multiz.fly/mp
    cp /cluster/data/dm1/bed/blastz.dp2.2004-08-03/mafNet/*.maf \
      /cluster/bluearc/multiz.fly/mp

    ssh kki
    mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2
    mkdir myp
    # Wrapper script required because of stdout redirect:
    cat << '_EOF_' > doMultiz
#!/bin/csh -ef
/cluster/bin/penn/multiz $1 $2 - > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doMultiz
    rm -f jobList
    foreach file (/cluster/bluearc/multiz.fly/my/*.maf) 
      set root=$file:t:r:r
      echo "doMultiz /cluster/bluearc/multiz.fly/mp/${root}.maf $file /cluster/data/dm1/bed/multiz.dm1droYak1dp2/myp/${root}.maf" >> jobList
    end
    para create jobList
    para try, check, push, check
#Completed: 11 of 11 jobs
#Average job time:                  52s       0.87m     0.01h    0.00d
#Longest job:                      144s       2.40m     0.04h    0.00d
#Submission to last job:           144s       2.40m     0.04h    0.00d

    # clean up bluearc (these are big files!)
    rm -r /cluster/bluearc/multiz.fly

    # no downloadables/db-loading -- just go on and do the 4-way multiz.


# BDGP 3.2 ANNOTATIONS (DONE 6/18/04 angie)
    ss kksilo
    cd /cluster/data/dm1/bed
    mv bdgpAnnotations bdgpAnnotations.3.1
    mkdir bdgpAnnotations.3.2
    cd /cluster/data/dm1/bed/bdgpAnnotations.3.2
    # No annotations for chrU or h's!  probably better that way...
    foreach c (2L 2R 3L 3R 4 X)
      set f = dmel_${c}_translation_r3.2.0.fasta.gz
      wget ftp://flybase.net/genomes/Drosophila_melanogaster/current/fasta/$f
      set f = dmel_${c}_r3.2.0.gff.gz
      wget ftp://flybase.net/genomes/Drosophila_melanogaster/current/gff/$f
    end
    gunzip dmel*.gff.gz
    # This new set is almost-but-not-quite GFF3, and all features (genes,
    # pseudogenes, ESTs, regulatory regions, kitchen sink) are included 
    # in one /gff file.  Perl the gene and non-coding gene parts of it into 
    # bdgp{Gene,NonCoding}.gtf and bdgp{Gene,NonCoding}Info.tab.
    chmod a+x ./extractGenes.pl
    ./extractGenes.pl *.gff
    # Sort GTF by position.
    sort -k1,1 -k4n,5n bdgpGene.gtf > tmp
    mv tmp bdgpGene.gtf
    sort -k1,1 -k4n,5n bdgpNonCoding.gtf > tmp
    mv tmp bdgpNonCoding.gtf

    # Proteins:
    zcat dmel*trans*.gz | perl -wpe 's/^>(CG\d+)(-\S+)?-P(\w)\s.*/>$1-R$3/' \
    > bdgpGenePep.fa

    # Load into test table for now -- don't want to mess up Gene Sorter 
    # stuff that depends on bdgpGene* tables.
    ssh hgwdev
    cd /cluster/data/dm1/bed/bdgpAnnotations.3.2
    # Reloaded without -genePredExt 1/6/05:
    ldHgGene -gtf dm1 bdgpGene2 bdgpGene.gtf
    featureBits dm1 bdgpGene
#28262131 bases of 126527731 (22.337%) in intersection
    featureBits dm1 bdgpGene2
#28178241 bases of 126527731 (22.270%) in intersection
    featureBits dm1 bdgpGene bdgpGene2
#27813359 bases of 126527731 (21.982%) in intersection
    # OK, looks OK.  Replace bdgpGene with bdgpGene2, reload bdgpGeneInfo,
    # then go back up and quickly rebuild all the hgNear (Gene Sorter) 
    # tables that depend on bdgpGene.  
    hgsql dm1 -e 'drop table bdgpGene; alter table bdgpGene2 rename bdgpGene'
    hgsql dm1 -e 'delete from bdgpGeneInfo'
    hgsql dm1 -e 'load data local infile "bdgpGeneInfo.tab" into table bdgpGeneInfo'
    # load bdgpNonCoding*
    # Reloaded without -genePredExt 1/6/05:
    ldHgGene -gtf dm1 bdgpNonCoding bdgpNonCoding.gtf
    hgsql dm1 -e 'delete from bdgpNonCodingInfo'
    hgsql dm1 -e 'load data local infile "bdgpNonCodingInfo.tab" into table bdgpNonCodingInfo'
    # load proteins
    hgPepPred dm1 generic bdgpGenePep bdgpGenePep.fa


# CYTOBANDS (DONE 6/15/04 angie)
    ssh hgwdev
    mkdir /cluster/data/dm1/bed/cytoband
    cd /cluster/data/dm1/bed/cytoband
    foreach c (2L 2R 3L 3R 4 X)
      wget ftp://flybase.net/genomes/Drosophila_melanogaster/current/gnomap/cytomap-$c.tsv
    end
    cp /dev/null cytoBand.tab
    foreach c (2L 2R 3L 3R 4 X)
      grep -v '^#' cytomap-$c.tsv \
      | awk '$3 > 0 && $2 != $3 {print "chr'$c'\t" $2 "\t" $3 "\t" $1 "\t";}' \
      >> cytoBand.tab
    end
    hgsql dm1 < ~/kent/src/hg/lib/cytoBand.sql
    hgsql dm1 -e 'load data local infile "cytoBand.tab" into table cytoBand'
    # Normally we would just create cytoBandIdeo by select * from cytoBand --
    # but that gives too high of a resolution for any bands to be visible!  
    # So make a boiled-down cytoBandIdeo that just has {number, letter} 
    # instead of {number, letter, number}.
    perl -we 'while (<>) { \
                chomp; @w=split(" "); \
                if ($w[3] =~ /(\d+[A-Z]+)\d+/) { \
                  $b = $1; \
                } else { die "doh!" } \
                if (! defined $lastB) { \
                  $start = $w[1]; \
                } elsif ($lastB ne $b) { \
                  print "$chrom\t$start\t$end\t$lastB\n"; \
                  $start = $w[1]; \
                } \
                ($chrom, $end, $lastB) = ($w[0], $w[2], $b); \
              } \
              print "$chrom\t$start\t$end\t$lastB\n";' cytoBand.tab \
     > cytoBandIdeo.tab
    hgsql dm1 -e 'create table cytoBandIdeo select * from cytoBand where 0'
    hgsql dm1 -e 'load data local infile "cytoBandIdeo.tab" into table \
                  cytoBandIdeo'
    # 6/23/04: trim the entries that fall off the ends of their respective 
    # chromosomes.
    foreach c (`echo select chrom from chromInfo | hgsql -N dm1`)
      set size = `echo select size from chromInfo where chrom = '"'$c'"' \
                  | hgsql -N dm1`
      echo $c $size
      foreach table (cytoBand cytoBandIdeo)
        echo delete from $table where chrom = \"$c\" and chromStart \> $size \
        | hgsql -N dm1
        echo update $table set chromEnd = $size where chrom = \"$c\" and \
             chromEnd \> $size \
        | hgsql -N dm1
      end
    end


# BLASTZ ANOPHELES (DONE 6/10/04 angie)
    # Will give human-fugu params a try... but without abridging repeats 
    # since I don't know which are lin-spec for fly vs. mosquito, and don't 
    # want to bother Arian or speculate.
    ssh kksilo
    mkdir /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10
    cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10
    cat << '_EOF_' > DEF
# D.melanogaster vs. A. gambiae
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - D. melanogaster
SEQ1_DIR=/cluster/bluearc/drosophila/dm1/nib
# unused: SEQ1_RMSK=
SEQ1_SMSK=
SEQ1_FLAG=-drosophila
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - A. gambiae
SEQ2_DIR=/cluster/bluearc/anoGam1/nib
# unused: SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=-anopheles
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/dm1/bed/blastz.anoGam1.2004-06-10

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

#DEBUG=1
'_EOF_'
    # << this line keeps emacs coloring happy
    # run bash shell if you don't already:
    bash
    source DEF
    mkdir run
    ~angie/hummus/make-joblist $DEF > $BASE/run/j
    sh ./xdir.sh
    cd run
    sed -e 's#^#/cluster/bin/penn/#' j > j2
    wc -l j*
    head j2
    mv j2 j
    # cluster run -- use rack 9 to avoid getting in the way of hg17.
    ssh kk9
    cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/run
    para create j
    para try, check, push, check, ....
#Completed: 693 of 693 jobs
#Average job time:                 208s       3.47m     0.06h    0.00d
#Longest job:                      675s      11.25m     0.19h    0.01d
#Submission to last job:          1884s      31.40m     0.52h    0.02d

    # back on kksilo...
    mkdir /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/run.1
    cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/run.1
    ~angie/hummus/do.out2lav ../DEF > j
    # small cluster run
    ssh kki
    cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/run.1
    para create j
    para try, check, push, check, ....
#Completed: 21 of 21 jobs
#Average job time:                   5s       0.09m     0.00h    0.00d
#Longest job:                       12s       0.20m     0.00h    0.00d
#Submission to last job:            14s       0.23m     0.00h    0.00d
    cd ..
    rm -r raw

    # third run: lav -> axt
    ssh kki
    cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10
    mkdir axtChrom pslChrom run.2
    cd run.2
    cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| $HOME/bin/x86_64/lavToAxt stdin \
    /cluster/bluearc/drosophila/dm1/nib /iscratch/i/anoGam1/nib stdout \
| $HOME/bin/x86_64/axtSort stdin ../../axtChrom/$chr.axt 
$HOME/bin/x86_64/axtToPsl ../../axtChrom/$chr.axt ../../S1.len ../../S2.len \
  ../../pslChrom/$chr.psl
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod a+x do.csh
    cp /dev/null jobList
    foreach d (../lav/chr*)
      echo "do.csh $d" >> jobList
    end
    para create jobList
    para try, check, push, check
#Completed: 11 of 11 jobs
#Average job time:                   9s       0.16m     0.00h    0.00d
#Longest job:                       18s       0.30m     0.01h    0.00d
#Submission to last job:            18s       0.30m     0.01h    0.00d


# CHAIN ANOPHELES BLASTZ (DONE 6/10/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    # Make our own linear gap file with reduced gap penalties:
    cat << '_EOF_' > ../../chickenHumanTuned.gap
tablesize	11
smallSize	111
position	1	2	3	11	111	2111	12111	32111	72111	152111	252111
qGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
tGap	325	360	400	450	600	1100	3600	7600	15600	31600	56600
bothGap	625	660	700	750	900	1400	4000	8000	16000	32000	57000
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
         -linearGap=../../chickenHumanTuned.gap \
         -verbose=0 $1 \
  /cluster/bluearc/drosophila/dm1/nib \
  /iscratch/i/anoGam1/nib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 11 of 11 jobs
#Average job time:                  13s       0.22m     0.00h    0.00d
#Longest job:                       19s       0.32m     0.01h    0.00d
#Submission to last job:            19s       0.32m     0.01h    0.00d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain dm1 ${c}_chainAnoGam1 $i
    end


# NET ANOPHELES BLASTZ (DONE 6/10/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChain
    netClass -noAr noClass.net dm1 anoGam1 anopheles.net

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChain
    rm noClass.net
    netFilter -syn anopheles.net > anophelesSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChain
    netFilter -minGap=10 anopheles.net |  hgLoadNet dm1 netAnoGam1 stdin
    netFilter -minGap=10 anophelesSyn.net | hgLoadNet dm1 netSyntenyAnoGam1 stdin


# MAKE VSANOGAM1 DOWNLOADABLES (DONE 10/5/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChain
    gzip -c all.chain > ../../../zips/anopheles.chain.gz
    gzip -c anopheles.net > ../../../zips/anopheles.net.gz
    cd ..
    mkdir ../../zips/axtNet
    foreach f (axtNet/chr*.axt)
      gzip -c $f > ../../zips/axtNet/$f:t.gz
    end

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsAnoGam1
    cd /usr/local/apache/htdocs/goldenPath/dm1/vsAnoGam1
    mv /cluster/data/dm1/zips/anopheles*.gz .
    mv /cluster/data/dm1/zips/axtNet .
    md5sum *.gz */*.gz > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.


# GENERATE ANOGAM1 MAF FOR MULTIZ FROM NET (DONE 6/10/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/axtChain
    netSplit anopheles.net net
    ssh kolossus
    cd /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10
    mkdir axtNet
    foreach f (axtChain/net/*)
      set chr = $f:t:r
      netToAxt $f axtChain/chain/$chr.chain /cluster/data/dm1/nib \
        /cluster/data/anoGam1/nib stdout \
      | axtSort stdin axtNet/$chr.axt
    end

    mkdir mafNet
    foreach f (axtNet/chr*.axt)
      set maf = mafNet/$f:t:r.ma.maf
      axtToMaf $f \
            /cluster/data/dm1/chrom.sizes /cluster/data/anoGam1/chrom.sizes \
            $maf -tPrefix=dm1. -qPrefix=anoGam1.
    end


# MULTIZ MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES (DONE 8/9/04 angie)
    # put the MAFs on bluearc
    ssh kksilo
    mkdir -p /cluster/bluearc/multiz.flymo/myp
    cp /cluster/data/dm1/bed/multiz.dm1droYak1dp2/myp/*.maf \
      /cluster/bluearc/multiz.flymo/myp
    mkdir -p /cluster/bluearc/multiz.flymo/ma
    cp /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/mafNet/*.maf \
      /cluster/bluearc/multiz.flymo/ma

    ssh kki
    mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1
    mkdir mypa
    # Wrapper script required because of stdout redirect:
    cat << '_EOF_' > doMultiz
#!/bin/csh -fe
/cluster/bin/penn/multiz $1 $2 - > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doMultiz
    rm -f jobList
    foreach file (/cluster/bluearc/multiz.flymo/myp/*.maf) 
      set root=$file:t:r:r
      echo "doMultiz /cluster/bluearc/multiz.flymo/ma/${root}.ma.maf $file /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/mypa/${root}.maf" >> jobList
    end
    para create jobList
    para try, check, push, check
#Completed: 11 of 11 jobs
#Average job time:                  27s       0.45m     0.01h    0.00d
#Longest job:                       74s       1.23m     0.02h    0.00d
#Submission to last job:            74s       1.23m     0.02h    0.00d
    du -sh mypa
#415M    mypa

    # clean up bluearc
    rm -r /cluster/bluearc/multiz.flymo

    # setup external files for database reference
    ssh hgwdev
    mkdir /gbdb/dm1/mzDy1Dp2Ag1_phast
    ln -s /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/mypa/chr*.maf \
      /gbdb/dm1/mzDy1Dp2Ag1_phast/
    # load into database
    hgLoadMaf -warn dm1 mzDy1Dp2Ag1_phast
    # The previous Conservation wigMaf had "pairwise mypa" in its trackDb.
    # Now we need to define a new suffix for the pairwise tables and 
    # /gbdb/ locations, in order to avoid a clash between dp1 and dp2 
    # d_pseudoobscura_$pairwise tables/gbdb.  
    mkdir /gbdb/dm1/{d_yakuba,d_pseudoobscura,a_gambiae}_myp2a
    cd /tmp
    ln -s /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/mafNet/*.maf \
      /gbdb/dm1/d_yakuba_myp2a
    hgLoadMaf -WARN dm1 d_yakuba_myp2a
    ln -s /cluster/data/dm1/bed/blastz.dp2.2004-08-03/mafNet/*.maf \
      /gbdb/dm1/d_pseudoobscura_myp2a
    hgLoadMaf -WARN dm1 d_pseudoobscura_myp2a
    ln -s /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/mafNet/*.maf \
      /gbdb/dm1/a_gambiae_myp2a
    hgLoadMaf -WARN dm1 a_gambiae_myp2a

    # put it out for download (10/5/04)
    ssh kksilo
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1
    mkdir ../../zips/mzMafs
    foreach f (mypa/*.maf)
      gzip -c $f > ../../zips/mzMafs/$f:t.gz
    end
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm1/multizDroYak1Dp2AnoGam1
    cd /usr/local/apache/htdocs/goldenPath/dm1/multizDroYak1Dp2AnoGam1
    mv /cluster/data/dm1/zips/mzMafs/* .
    rmdir /cluster/data/dm1/zips/mzMafs
    md5sum *.gz > md5sum.txt


# PHASTCONS MELANOGASTER/YAKUBA/PSEUDOOBSCURA (first cut 6/1/04 acs)

    # make sure /cluster/bin/phast is in your path
    # commands below use bash shell
    # watch for typos -- copying from a Makefile and editing a bit

    # there are three steps to go through: fitting a phylogenetic
    # model to the data set with phyloFit, then running phastCons for
    # conservation scores, then running phastCons for predictions of
    # conserved elements.

    # step 1: fit phylogenetic model, with rate variation (discrete
    # gamma model) first we need to extract the sufficient stats for
    # the phylogenetic model from the MAFs.  In this case, we don't
    # care about the order of the tuples (allows for much more compact
    # representation)

    ssh eieio
    mkdir /cluster/data/dm1/bed/phastCons
    cd /cluster/data/dm1/bed/phastCons
    MAF=/cluster/data/dm1/bed/multiz.dm1droYak1dp2/myp
    # first extract chromosome by chromosome
    for file in ${MAF}/*.maf ; do \
	msa_view $file --in-format MAF --out-format SS --order dm1,droYak1,dp2 \
	    --unordered-ss  > `basename $file .maf`.ss ;\
    done
    # now aggregate for whole genome
    ls chr*.ss > fnames
    msa_view "*fnames" -i SS --aggregate dm1,droYak1,dp2 -o SS --unordered-ss > all.ss

    # now estimate parameters; should be very fast in this case
    phyloFit --msa all.ss --tree "((1,2),3)" --msa-format SS --subst-mod REV --nrates 10 --out-root rev-dg --output-tree

    # make sure model looks reasonable
    cat rev-dg.mod
#ALPHABET: A C G T
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 1.273963
#TRAINING_LNL: -290963642.026431
#BACKGROUND: 0.279634 0.220483 0.220439 0.279444
#RATE_MAT:
#  -0.929128    0.196259    0.495954    0.236916
#   0.248910   -1.089402    0.212511    0.627981
#   0.629134    0.212554   -1.090600    0.248913
#   0.237077    0.495481    0.196354   -0.928911
#TREE: ((1:0.057640,2:0.073631):0.166542,3:0.166542);

    # beware of zero branch lengths (indicates bad topology) or very
    # large branch lengths (much greater than one; indicates
    # convergence problems).  Can try without --nrates or with
    # --subst-mod HKY85 to double check (parameter estimates should be
    # in the same ballpark, REV + dG likelihood should be a bit
    # better).  Also can write a log file to monitor convergence
    # (--log).  Also watch out for alpha much less than one.
    # Sometimes it's helpful to produce a rendering of the tree using
    # draw_tree (run on the *.nh file)

    # step 2: run phastCons.  First partition the alignments into
    # bite-sized chunks.  This time we need the ordered version of the
    # SS format.

    # some vars used below
    MAF=/cluster/data/dm1/bed/multiz.dm1droYak1dp2/myp
    FA=/cluster/data/dm1
    WINSIZE=1000000
    WINOVERLAP=0

    for file in ${MAF}/*.maf ; do \
	root=`basename $file .maf` ;\
	chr=`echo $root | sed 's/chr//'` ;\
	echo $file $root $chr ;\
	mkdir -p SS/$chr ;\
	msa_split $file -i MAF -o SS -O dm1,droYak1,dp2 -M ${FA}/$chr/$root.fa \
	    -w ${WINSIZE},${WINOVERLAP} -r SS/$chr/$$root -I 1000 -d 1 -B 5000 ;\
    done
    # (this is worth doing as a little cluster job with mammalian genomes,
    # but it's pretty fast with fly)

    ssh hgwdev
    cd /cluster/data/dm1/bed/phastCons

    # now set up cluster job.  Make a little wrapper for phastCons
    cat << '_EOF_' > doPostProbs
#!/bin/sh

PHAST=/cluster/bin/phast
TMP=/scratch/phastCons

file=$1
root=`basename $file .ss`
chrom=`echo $root | awk -F\. '{print $1}'`

mkdir -p $TMP
$PHAST/phastCons $file rev-dg.mod --cut-at 2 --nrates 10 --transitions 0.030,0.015 --quiet > ${TMP}/$root.pp
mkdir -p POSTPROBS/$chrom
gzip -c $TMP/$root.pp > POSTPROBS/$chrom/$root.pp.gz
rm $TMP/$root.pp
'_EOF_'
    # << this line makes emacs coloring happy
    chmod 775 doPostProbs

    # Note: the --cut-params arguments above are approximate
    # likelihood estimates obtained by running phastCons *without* the
    # --cut-params argument on four or five different windows (all
    # gave similar results).  They may need to change for different
    # data sets.  Careful, though: the parameter estimation procedure
    # is a little unstable.  Be sure to create a log file and monitor
    # for convergence.

    # 2nd note: manually tried various parameter settings and settled
    # on --cut-params 0.010,0.005.  Seems to give best results for
    # both post probs and viterbi path.  At least until we settle on a
    # better principle for setting the params, these are the
    # recommended ones to use.

    # set up a jobs list
    rm -f jobs.lst
    for file in `find SS -name "*.ss"` ; do echo doPostProbs $file >> jobs.lst ; done

    # run cluster job
    ssh kk
    cd /cluster/data/dm1/bed/phastCons
    para create jobs.lst ; para try ; para push ; etc....
#Completed: 134 of 134 jobs
#CPU time in finished jobs:       2584s      43.07m     0.72h    0.03d  0.000 y
#IO & Wait Time:                  2090s      34.83m     0.58h    0.02d  0.000 y
#Average job time:                  35s       0.58m     0.01h    0.00d
#Longest job:                       46s       0.77m     0.01h    0.00d
#Submission to last job:            62s       1.03m     0.02h    0.00d
    logout

    # now create wiggle track
    # NOTE: might want to integrate with the multiz track instead of
    # keeping separate
    mkdir -p wib
    for dir in POSTPROBS/* ; do \
	echo $dir ;\
	chr=`basename $dir` ;\
	zcat `ls $dir/*.pp.gz | sort -t\. -k2,2n` | \
	    wigAsciiToBinary -chrom=$chr \
	    -wibFile=wib/${chr}_phastCons stdin ;\
    done
    hgLoadWiggle dm1 phastCons wib/chr*_phastCons.wig
    mkdir -p /gbdb/dm1/wib
    rm -f /gbdb/dm1/wib/*phastCons.wib
    ln -s `pwd`/wib/*.wib /gbdb/dm1/wib
    chmod 775 . wib
    chmod 664 wib/*.wib

    # trackDb.ra entry
#track phastCons
#shortLabel phastCons 
#longLabel phastCons Conservation Score, melanogaster/yakuba/pseudoobscura
#group compGeno
#priority 103
#visibility hide
#color 0,10,100
#maxHeightPixels 40
#type wig 0.0 1.0
#autoScale off

    # step 3:  predictions of conserved elements
    # (could do these at the same time as step 2, but we want to use
    # different --rates-cut params)
    cat << '_EOF_' > doViterbi
#!/bin/sh

PHAST=/cluster/home/acs/phast/bin
TMP=/scratch/phastCons

file=$1
root=`basename $file .ss`
chrom=`echo $root | awk -F\. '{print $1}'`

mkdir -p PREDICTIONS/$chrom
$PHAST/phastCons $file rev-dg.mod --nrates 10 --viterbi PREDICTIONS/$chrom/$root.bed --score --no-post-probs --transitions 0.030,0.015 --quiet --seqname $chrom
'_EOF_'
    # << this line makes emacs coloring happy
    chmod 775 doViterbi

    # see note above regarding --cut-params (0.010,0.005 recommended)

    rm -f jobs.viterbi.lst
    for file in `find SS -name "*.ss"` ; do echo doViterbi $file >> jobs.viterbi.lst ; done
    logout

    ssh kk
    cd /cluster/data/dm1/bed/phastCons
    para create jobs.viterbi.lst ; para try ; para push ; etc....
#CPU time in finished jobs:        320s       5.34m     0.09h    0.00d  0.000 y
#IO & Wait Time:                   427s       7.11m     0.12h    0.00d  0.000 y
#Average job time:                   6s       0.09m     0.00h    0.00d
#Longest job:                       10s       0.17m     0.00h    0.00d
#Submission to last job:            52s       0.87m     0.01h    0.00d
    logout

    # create track; we want to tweak the scores and the names
    sed 's/id //' PREDICTIONS/*/*.bed | \
	awk '{printf "%s\t%s\t%s\tlod=%d\t%d\t%s\t%s\t%s\t%s\t%s\t%s\t%s\n", \
	    $1, $2, $3, $5, 221.65 * log($5) - 352.64, $6, $7, $8, $9, \
	    $10, $11, $12}' > all.bed
    hgLoadBed dm1 phastConsElements all.bed

    # Scores are transformed as follows, for a reasonable-looking 
    # "spectrum".  Let x_max be the maximum (believable) score (here
    # x_max  = 447) and let x_med be the median score (here x_med =
    # 19).  The  scores are transformed via the function f(x) = a *
    # log x + b, s.t. f(x_med) = 300 and f(x_max) = 1000.  Solving
    # for a and b, you obtain  b = (300 log x_max - 1000 log x_med) /
    # (log x_max - log x_med), a = (1000 - b) / log x_max.  Here a =
    # 221.65, b = -352.64.

    # trackDb.ra file
#track phastConsElements
#shortLabel phastConsElements
#longLabel phastCons Conserved Elements, melanogaster/yakuba/pseudoobscura
#group compGeno
#priority 105
#visibility hide
#spectrum on
#color 0,60,120
#altColor 200,220,255
#exonArrows off
#type bed 12 .

    # should gzip or even delete contents of SS directory when done


# PHASTCONS MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES (DONE 8/10/04 angie)
# NOTE - REPLACED, SEE "NEW PHASTCONS" SECTION BELOW
    # step 1: fit phylogenetic model, with rate variation (discrete
    # gamma model) first we need to extract the sufficient stats for
    # the phylogenetic model from the MAFs.  In this case, we don't
    # care about the order of the tuples (allows for much more compact
    # representation)
    ssh kksilo
    mkdir /cluster/data/dm1/bed/phastCons4way
    cd /cluster/data/dm1/bed/phastCons4way
    mkdir unordered
    foreach f (/cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/mypa/chr*.maf)
      /cluster/bin/phast/msa_view $f --in-format MAF --out-format SS \
            --order dm1,droYak1,dp2,anoGam1 \
            --unordered-ss  > unordered/$f:t:r.ss
    end
    # aggregate for whole genome
    ls unordered/chr*.ss > fnames
    /cluster/bin/phast/msa_view "*fnames" -i SS \
      --aggregate dm1,droYak1,dp2,anoGam1 -o SS --unordered-ss \
      > all.ss
    # estimate parameters; should be very fast in this case
    /cluster/bin/phast/phyloFit --msa all.ss \
      --tree "(((dm1,droYak1),dp2),anoGam1)" \
      --msa-format SS --subst-mod REV --nrates 10 --out-root rev-dg
    # make sure model looks reasonable
    cat rev-dg.mod
#ALPHABET: A C G T 
#ORDER: 0
#SUBST_MOD: REV
#NRATECATS: 10
#ALPHA: 1.343144
#TRAINING_LNL: -311520966.123671
#BACKGROUND: 0.276938 0.223190 0.223142 0.276730 
#RATE_MAT:
#  -0.926281    0.197688    0.496369    0.232223 
#   0.245294   -1.090671    0.230605    0.614772 
#   0.616038    0.230655   -1.092265    0.245573 
#   0.232398    0.495831    0.198018   -0.926247 
#TREE: (((dm1:0.056418,droYak1:0.074515):0.111186,dp2:0.219098):0.288564,anoGam1:0.288564);

    # Since anopheles is at such a large distance (est. divergence of 
    # branches that became drosophila & anopheles: ~250mya, as opposed to 
    # 35-40mya for mel-pseudo and 10-15mya for mel-yakuba), the branch length
    # between anopheles and pseudoobscura is underestimated above.  Adam 
    # scaled the lengths according to the published est. mya's, so 
    # edit rev-dg.mod's TREE line to this:
#TREE: (((dm1:0.058,droYak1:0.074):0.133,dp2:0.200):0,anoGam1:2.66);
    # A new way to arrive at that: put Mya estimates in 4way.nh, and 
    # merge with the (more reliable) 3-way estimated tree:
    echo "(((dm1:10,droYak1:10):30,dp2:40):210,anoGam1:250);" \
    > 4wayMya.nh
    /cluster/bin/phast/tree_doctor /cluster/data/dm1/bed/phastCons/rev-dg.mod \
      --rename '1->dm1; 2->droYak1; 3->dp2; 4->anoGam1' \
    | /cluster/bin/phast/tree_doctor - --merge 4wayMya.nh
#TREE: (((dm1:0.057640,droYak1:0.073631):0.142750,dp2:0.190334):1.083495,anoGam1:1.28987
    # Very similar to Adam's tree with a little manual redistribution of 
    # weights from ancestors to outgroups at the outermost 2 nodes.

    # beware of zero branch lengths (indicates bad topology) or very
    # large branch lengths (much greater than one; indicates
    # convergence problems).  Can try without --nrates or with
    # --subst-mod HKY85 to double check (parameter estimates should be
    # in the same ballpark, REV + dG likelihood should be a bit
    # better).  Also can write a log file to monitor convergence
    # (--log).  Also watch out for alpha much less than one.
    # Sometimes it's helpful to produce a rendering of the tree using
    # draw_tree (run on the *.nh file)

    # step 2: run phastCons.  First partition the alignments into
    # bite-sized chunks.  This time we need the ordered version of the
    # SS format.
    set WINSIZE=1000000
    set WINOVERLAP=0
    mkdir SS
    foreach f (/cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/mypa/*.maf)
      set chr=$f:t:r
      set c=`echo $chr | sed 's/chr//'`
      echo $f $chr $c
      mkdir SS/$c
      /cluster/bin/phast/msa_split $f -i MAF -o SS \
        -O dm1,droYak1,dp2,anoGam1 -M /cluster/data/dm1/$c/$chr.fa \
        -w ${WINSIZE},${WINOVERLAP} -r SS/$c/$chr -I 1000 -d 1 -B 5000
    end
    # save some space and I/O time:
    gzip SS/*/*.ss
    ssh kk
    cd /cluster/data/dm1/bed/phastCons4way
    cat << '_EOF_' > doPostProbs
#!/bin/csh -ef

set PHAST=/cluster/bin/phast
set TMP=/scratch/phastCons

set file=$1
set root = $file:t:r:r
set chrom = $root:r

mkdir -p $TMP
cp $file $TMP/$root.ss.gz
zcat $TMP/$root.ss.gz \
| $PHAST/phastCons - rev-dg.mod --cut-at 2 --nrates 10 \
  --transitions 0.030,0.015 --quiet \
  > ${TMP}/$root.pp
mkdir -p POSTPROBS/$chrom
gzip -c $TMP/$root.pp > POSTPROBS/$chrom/$root.pp.gz
rm $TMP/$root.pp
rm $TMP/$root.ss.gz
'_EOF_'
    # << this line makes emacs coloring happy
    chmod 775 doPostProbs
    # Note: the --cut-params arguments above are approximate
    # likelihood estimates obtained by running phastCons *without* the
    # --cut-params argument on four or five different windows (all
    # gave similar results).  They may need to change for different
    # data sets.  Careful, though: the parameter estimation procedure
    # is a little unstable.  Be sure to create a log file and monitor
    # for convergence.
    rm -f jobs.lst
    foreach f (`ls -1S SS/*/*.ss.gz`)
      echo './doPostProbs {check in exists+ '$f'}' \
      >> jobs.lst
    end
    para create jobs.lst 
    para try ; para push ; etc....
#Completed: 134 of 134 jobs
#Average job time:                  23s       0.38m     0.01h    0.00d
#Longest job:                       30s       0.50m     0.01h    0.00d
#Submission to last job:           187s       3.12m     0.05h    0.00d

    # now create wiggle track
    # NOTE: might want to integrate with the multiz track instead of
    # keeping separate
    ssh kksilo
    cd /cluster/data/dm1/bed/phastCons4way
    mkdir wib
    foreach dir (POSTPROBS/*)
        echo $dir
        set chr=$dir:t
        zcat `ls -1 $dir/*.pp.gz | sort -t\. -k2,2n` | \
            wigAsciiToBinary -chrom=$chr \
            -wibFile=wib/${chr}_phastCons4way stdin
    end
    ssh hgwdev
    mkdir -p /gbdb/dm1/wib/mzDy1Dp2Ag1_phast
    cd /cluster/data/dm1/bed/phastCons4way
    chmod 775 . wib
    chmod 664 wib/*.wib
    ln -s `pwd`/wib/*.wib /gbdb/dm1/wib/mzDy1Dp2Ag1_phast/
    hgLoadWiggle dm1 mzDy1Dp2Ag1_phast_wig \
      -pathPrefix=/gbdb/dm1/wib/mzDy1Dp2Ag1_phast wib/*.wig

    # step 3:  predictions of conserved elements
    # (could do these at the same time as step 2, but we want to use
    # different --rates-cut params)
    ssh kk
    cd /cluster/data/dm1/bed/phastCons4way
    cat << '_EOF_' > doElements
#!/bin/csh -fe

set PHAST=/cluster/home/acs/phast/bin
set TMP=/scratch/phastCons

set file=$1
set root = $file:t:r:r
set chrom = $root:r

mkdir -p $TMP
mkdir -p PREDICTIONS/$chrom
cp $file $TMP/$root.ss.gz
zcat $TMP/$root.ss.gz \
| $PHAST/phastCons - rev-dg.mod --nrates 10 \
  --viterbi PREDICTIONS/$chrom/$root.bed --score --no-post-probs \
  --transitions 0.030,0.015 --quiet --seqname $chrom
rm $TMP/$root.ss.gz
'_EOF_'
    # << this line makes emacs coloring happy
    chmod 775 doElements
    rm -f jobs.els.lst
    foreach f (SS/*/*.ss.gz)
      echo 'doElements {check in exists+ '$f'}' >> jobs.els.lst
    end
    para create jobs.els.lst
    para try ; para push ; etc....
#Completed: 134 of 134 jobs
#Average job time:                   5s       0.09m     0.00h    0.00d
#Longest job:                        9s       0.15m     0.00h    0.00d
#Submission to last job:           287s       4.78m     0.08h    0.00d

    # Reloaded 8/24/04 with corrected transform / new script:
    # Create bed track: tweak scores and names, trim the unused strand column.
    cat PREDICTIONS/*/*.bed \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
    | /cluster/bin/scripts/lodToBedScore \
    > all.bed

    hgLoadBed dm1 phastConsElements all.bed
    featureBits dm1 phastConsElements
#22859943 bases of 126527731 (18.067%) in intersection

    # make top-5000 list and launcher on Adam's home page:
    sort -k5,5nr raw.bed | head -5000 > top5000.bed
    /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
      /cse/grads/acs/public_html/dm-top5000-4way \
      "top 5000 conserved elements (4way)" dm1


# NEW PHASTCONS MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES (DONE 9/24/04 angie)
    ssh kksilo
    # copy chrom fa to bluearc, break up the genome-wide MAFs into pieces
    mkdir -p /cluster/bluearc/dm1/chrom
    cp -p /cluster/data/dm1/?{,?}/chr*.fa /cluster/bluearc/dm1/chrom/
    ssh kki
    mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons
    mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.split
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.split
    set WINDOWS = /cluster/bluearc/dm1/phastCons/WINDOWS
    rm -fr $WINDOWS
    mkdir -p $WINDOWS
    cat << 'EOF' > doSplit.sh
#!/bin/csh -ef

set PHAST=/cluster/bin/phast
set FA_SRC=/cluster/bluearc/dm1/chrom
set WINDOWS=/cluster/bluearc/dm1/phastCons/WINDOWS

set maf=$1
set c = $maf:t:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O dm1,droYak1,dp2,anoGam1 \
   -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
  gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
    chmod a+x doSplit.sh
    rm -f jobList
    foreach file (/cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/mypa/*.maf) 
      if (-s $file) then
        echo "doSplit.sh {check in line+ $file}" >> jobList
      endif
    end
    para create jobList
    para try,  check, push, check...
#Completed: 11 of 11 jobs
#Average job time:                  25s       0.42m     0.01h    0.00d
#Longest job:                       60s       1.00m     0.02h    0.00d
#Submission to last job:            60s       1.00m     0.02h    0.00d
    cd ..

    # use the model previously estimated (seein phastCons4way as a 
    # starting model.
    cp /cluster/data/dm1/bed/phastCons4way/rev-dg.mod starting-tree.mod
    # -- Because of the very long branch length to anoGam1 being pretty 
    # much impossible to estimate from alignment data, edit that file to 
    # reduce the anoGam1 branch length from 2.66 to 0.5.  Otherwise 
    # estimation process blows up.  So our starting tree becomes
#TREE: (((dm1:0.058,droYak1:0.074):0.133,dp2:0.200):0,anoGam1:0.5);

    # Get genome-wide average GC content (for all species together,
    # not just the reference genome).  If you have a globally
    # estimated tree model, as above, you can get this from the
    # BACKGROUND line in the .mod file.  E.g.,
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.276938 0.223190 0.223142 0.276730
    # add up the C and G:
    awk '$1 == "BACKGROUND:" {printf "%0.3f\n", $3 + $4;}' starting-tree.mod
#0.446
    # Great, use 0.446 as the --gc parameter in phastCons below:.

    # Now set up cluster job to estimate model parameters.  
    # Use parameters iteratively determined for dm2 -- see makeDm2.doc:
    # --target-coverage 0.20 --expected-lengths 25
    # Parameters will be estimated separately for each alignment fragment 
    # then will be combined across fragments.

    mkdir run.estimate
    cd run.estimate
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.446 --nrates 1,1 \
    --no-post-probs --ignore-missing --expected-lengths 25 \
    --target-coverage 0.20 --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    chmod a+x doEstimate.sh
    rm -fr LOG TREES
    mkdir -p LOG TREES
    rm -f jobList
    foreach f (/cluster/bluearc/dm1/phastCons/WINDOWS/*.ss.gz)
      set root = $f:t:r:r
      echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
    end
    # run cluster job
    ssh kk9
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.estimate
    para create jobList
    para try, check, push, check, ...
#Completed: 134 of 134 jobs
#Average job time:                 135s       2.26m     0.04h    0.00d
#Longest job:                      258s       4.30m     0.07h    0.00d
#Submission to last job:           350s       5.83m     0.10h    0.00d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh kksilo
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.estimate
    ls -1 TREES/*.cons.mod > cons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
      --output-average ave.cons.mod > cons_summary.txt
    ls -1 TREES/*.noncons.mod > noncons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
      --output-average ave.noncons.mod > noncons_summary.txt
    grep TREE ave*.mod
#ave.cons.mod:TREE: (((dm1:0.019916,droYak1:0.025549):0.038378,dp2:0.068605):0.085662,anoGam1:0.085662);
#ave.noncons.mod:TREE: (((dm1:0.081971,droYak1:0.105403):0.161271,dp2:0.290165):0.361849,anoGam1:0.361849);
    cat cons_summary.txt 
    # look over the files cons_summary.txt and noncons_summary.txt.
    # The means and medians should be roughly equal and the stdevs
    # should be reasonably small compared to the means, particularly
    # for rate matrix parameters (at bottom) and for branches to the
    # leaves of the tree.  The stdevs may be fairly high for branches
    # near the root of the tree; that's okay.  Some min values may be
    # 0 for some parameters.  That's okay, but watch out for very large
    # values in the max column, which might skew the mean.  If you see
    # any signs of bad outliers, you may have to track down the
    # responsible .mod files and throw them out.  I've never had to do
    # this; the estimates generally seem pretty well behaved.

    # NOTE: Actually, a random sample of several hundred to a thousand
    # alignment fragments (say, a number equal to the number of
    # available cluster nodes) should be more than adequate for
    # parameter estimation.  If pressed for time, use this strategy.

    # Now we are ready to set up the cluster job for computing the
    # conservation scores and predicted elements.  It's all downhill
    # from here.
    ssh kk9
    mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.phast
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.phast
    cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
    ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
    --expected-lengths 25 --target-coverage 0.20 --quiet --seqname $chr \
    --idpref $pref \
    --viterbi /cluster/bluearc/dm1/phastCons/ELEMENTS/$pref.bed --score \
    --require-informative 0 \
  > $tmpfile
gzip -c $tmpfile > /cluster/bluearc/dm1/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
    chmod a+x doPhastCons.sh
    rm -fr /cluster/bluearc/dm1/phastCons/{POSTPROBS,ELEMENTS}
    mkdir -p /cluster/bluearc/dm1/phastCons/{POSTPROBS,ELEMENTS}
    rm -f jobList
    foreach f (/cluster/bluearc/dm1/phastCons/WINDOWS/*.ss.gz)
      echo doPhastCons.sh $f >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 134 of 134 jobs
#Average job time:                  26s       0.43m     0.01h    0.00d
#Longest job:                       53s       0.88m     0.01h    0.00d
#Submission to last job:            55s       0.92m     0.02h    0.00d

    # back on kksilo:
    # combine predictions and transform scores to be in 0-1000 interval
    # do in a way that avoids limits on numbers of args
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons
    awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
      /cluster/bluearc/dm1/phastCons/ELEMENTS/*.bed \
    | /cluster/bin/scripts/lodToBedScore > all.bed

    ssh hgwdev
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons
    featureBits dm1 all.bed
#37105867 bases of 126527731 (29.326%) in intersection
    # OK, close enough.
    hgLoadBed dm1 phastConsElements all.bed

    # Create wiggle on the small cluster
    ssh kki
    mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.wib
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.wib
    rm -rf /cluster/bluearc/dm1/phastCons/wib
    mkdir -p /cluster/bluearc/dm1/phastCons/wib
    cat << 'EOF' > doWigAsciiToBinary
#!/bin/csh -ef
set chr = $1
zcat `ls -1 /cluster/bluearc/dm1/phastCons/POSTPROBS/$chr.*.pp.gz \
      | sort -t\. -k2,2n` \
| wigAsciiToBinary -chrom=$chr \
    -wibFile=/cluster/bluearc/dm1/phastCons/wib/${chr}_phastCons stdin 
'EOF'
# << for emacs
    chmod a+x doWigAsciiToBinary
    rm -f jobList
    foreach chr (`ls -1 /cluster/bluearc/dm1/phastCons/POSTPROBS \
                  | awk -F\. '{print $1}' | sort -u`)
      echo doWigAsciiToBinary $chr >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 11 of 11 jobs
#Average job time:                  19s       0.31m     0.01h    0.00d
#Longest job:                       47s       0.78m     0.01h    0.00d
#Submission to last job:            47s       0.78m     0.01h    0.00d

    # back on kksilo, copy wibs, wigs and POSTPROBS (people sometimes want 
    # the raw scores) from bluearc
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons
    rm -rf wib POSTPROBS
    rsync -av /cluster/bluearc/dm1/phastCons/wib .
    rsync -av /cluster/bluearc/dm1/phastCons/POSTPROBS .

    # load wiggle component of Conservation track
    ssh hgwdev
    mkdir -p /gbdb/dm1/wib/mzDy1Dp2Ag1_phast
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons
    chmod 775 . wib
    chmod 664 wib/*.wib
    ln -s `pwd`/wib/*.wib /gbdb/dm1/wib/mzDy1Dp2Ag1_phast/
    hgLoadWiggle dm1 mzDy1Dp2Ag1_phast_wig \
      -pathPrefix=/gbdb/dm1/wib/mzDy1Dp2Ag1_phast wib/*.wig

    # make top-5000 list and launcher on Adam's home page:
    sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \
    > top5000.bed
    /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
      /cse/grads/acs/public_html/dm-top5000-4way \
      "top 5000 conserved elements (4way)" dm1

    # and clean up bluearc.
    rm -r /cluster/bluearc/dm1/phastCons
    rm -r /cluster/bluearc/dm1/chrom


# LOAD ENSEMBL GENES (NEVER MIND 6/16/04 angie)
    # OK, never mind, Ensembl just imported the BDGP / Flybase genes.

# BLAT FlyBase predicted DM proteins against DM
    ssh hgwdev
    cd /cluster/data/dm1/bed
    mkdir blat.dm1FB
    cd blat.dm1FB
    pepPredToFa dm1 bdgpGenePep dm1FB.fa
    hgPepPred dm1 generic blastFBPep00 dm1FB.fa
    ssh kk
    cd /cluster/data/hg17/bed/blat.dm1FB
    cat << '_EOF_' > blatSome
#!/bin/csh -fe
/cluster/bin/i386/blat -t=dnax -q=prot -out=pslx $1 $2 $3
'_EOF_'
    # << keep emacs happy
    chmod +x blatSome
    ls -1S /iscratch/i/dm1/nib/*.nib > bug.lst
    mkdir fbfas
    cd fbfas
    faSplit sequence ../dm1FB.fa 5000 kg
    cd ..
    ls -1S fbfas/*.fa > fb.lst
    cat << '_EOF_' > blatGsub
#LOOP
blatSome $(path1) {check in line $(path2)} {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << keep emacs happy
    gensub2 human.lst kg.lst blatGsub blatSpec
    mkdir psl
    cd psl
    foreach i (`cat ../bug.lst`)
	mkdir `basename $i .nib`
    end
    cd ..
    para create blatSpec
    para push

53361 jobs in batch
Completed: 53361 of 53361 jobs
CPU time in finished jobs:     640171s   10669.52m   177.83h    7.41d  0.020 y
IO & Wait Time:                144105s    2401.75m    40.03h    1.67d  0.005 y
Average job time:                  15s       0.24m     0.00h    0.00d
Longest job:                     1924s      32.07m     0.53h    0.02d
Submission to last job:          3506s      58.43m     0.97h    0.04d

    ssh eieio
    cd /cluster/data/dm1/bed/blat.dm1FB
    pslSort dirs raw.psl /tmp psl/*
    pslReps -nohead -minCover=0.9 -minAli=0.9 raw.psl cov90.psl /dev/null
    sort -rn cov90.psl | pslUniq stdin dm1FB.psl
    pslxToFa dm1FB.psl dm1FB_ex.fa -liftTarget=genome.lft -liftQuery=protein.lft
    fbName dm1 dm1FB.psl blastFBRef00
    ssh hgwdev
    cd /cluster/data/dm1/bed/blat.dm1FB
    hgsql dm1 < ~/kent/src/lib/hg/blastRef.sql
    echo "rename table blastRef to blastFBRef00" | hgsql dm1
    echo "load data local infile 'blastFBRef00' into table blastFBRef00" | hgsql dm1

# MITOPRED DATA FOR HGGENE (DONE 8/10/04 angie)
    ssh hgwdev
    mkdir /cluster/data/dm1/bed/mitopred
    cd /cluster/data/dm1/bed/mitopred
    wget http://mitopred.sdsc.edu/data/fly_30.out
    perl -wpe 's/^(\S+)\s+\S+\s+(.*)/$1\t$2/' fly_30.out > mitopred.tab
    cat > mitopred.sql << '_EOF_'
# Prediction of nuclear-encoded mito. proteins from http://mitopred.sdsc.edu/
CREATE TABLE mitopred (
    name varchar(10) not null,      # SwissProt ID
    confidence varchar(8) not null, # Confidence level
              #Indices
    PRIMARY KEY(name(6))
);
'_EOF_'
    # << this line makes emacs coloring happy
    hgsql dm1 < mitopred.sql
    hgsql dm1 -e 'load data local infile "mitopred.tab" into table mitopred'

# MAKE Human Proteins track
    mkdir -p /cluster/data/dm1/bed/tblastn.hg16KG
    cd /cluster/data/dm1/bed/tblastn.hg16KG
    ls -1S /iscratch/i/dm1/blastDb/*.nsq | sed "s/\.nsq//" > bug.lst
    exit

    # back to kksilo
    cd /cluster/data/dm1/bed/tblastn.hg16KG
    mkdir kgfa
    # calculate a reasonable number of jobs
    calc `wc /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl | awk "{print \\\$1}"`/\(150000/`wc bug.lst | awk "{print \\\$1}"`\)
    # 41117/(150000/578) = 158.437507
    split -l 158 /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl kgfa/kg
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
f=/tmp/`basename $3`
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg16/bed/blat.hg16KG/protein.lft warn $f.2
	if pslCheck -prot $3.tmp
	then
	    mv $3.tmp $3
	    rm -f $f.1 $f.2 
	    exit 0
	fi
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 bug.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/dm1/bed/tblastn.hg16KG
    para create blastSpec
    para push

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/dm1/bed/tblastn.hg16KG
    para create chainSpec
    para push

    exit
    # back to kksilo
    cd /cluster/data/dm1/bed/tblastn.hg16KG/blastOut
    for i in kg??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n  | uniq  > ../preblastHg16KG.psl
    cd ..
    blatDir=/cluster/data/hg16/bed/blat.hg16KG
    protDat -kg preblastHg16KG.psl $blatDir/hg16KG.psl $blatDir/kg.mapNames blastHg16KG.psl

    ssh hgwdev
    cd /cluster/data/dm1/bed/tblastn.hg16KG
    hgLoadPsl dm1 blastHg16KG.psl
    exit

    # back to kksilo
    rm -rf blastOut

# End tblastn


# BLASTZ DM2 (FOR LIFTOVER EXPERIMENT) (DONE 9/15/04 angie)
    ssh kksilo
    mkdir /cluster/data/dm1/bed/blastz.dm2.2004-09-15
    cd /cluster/data/dm1/bed/blastz.dm2.2004-09-15
    cat << '_EOF_' > DEF
# dm1 vs. dm2 (for liftOver)
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=/cluster/bin/penn/blastz-run
BLASTZ=/cluster/bin/penn/blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

SEQ1_DIR=/iscratch/i/dm1/nib
SEQ1_SMSK=
SEQ1_FLAG=-drosophila
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

SEQ2_DIR=/iscratch/i/dm2/nib
SEQ2_SMSK=
SEQ2_FLAG=-drosophila
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/dm1/bed/blastz.dm2.2004-09-15

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

'_EOF_'
    # << this line keeps emacs coloring happy
    # run bash shell if you don't already:
    bash
    source DEF
    mkdir run
    /cluster/bin/scripts/blastz-make-joblist $DEF > $BASE/run/j
    sh ./xdir.sh
    # cluster run -- use rack 9 to avoid getting in the way of hg17.
    ssh kk9
    cd /cluster/data/dm1/bed/blastz.dm2.2004-09-15/run
    para create j
    para try, check, push, check, ....
#Completed: 483 of 483 jobs
#Average job time:                 119s       1.99m     0.03h    0.00d
#Longest job:                     1238s      20.63m     0.34h    0.01d
#Submission to last job:          1625s      27.08m     0.45h    0.02d

    # back on kksilo...
    mkdir /cluster/data/dm1/bed/blastz.dm2.2004-09-15/run.1
    cd /cluster/data/dm1/bed/blastz.dm2.2004-09-15/run.1
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > jobList
    # small cluster run
    ssh kki
    cd /cluster/data/dm1/bed/blastz.dm2.2004-09-15/run.1
    para create jobList
    para try, check, push, check, ....
#Completed: 21 of 21 jobs
#Average job time:                   8s       0.13m     0.00h    0.00d
#Longest job:                       23s       0.38m     0.01h    0.00d
#Submission to last job:            35s       0.58m     0.01h    0.00d
    cd ..
    rm -r raw

    # third run: lav -> axt
    ssh kki
    cd /cluster/data/dm1/bed/blastz.dm2.2004-09-15
    mkdir axtChrom run.2
    cd run.2
    cat << '_EOF_' > do.csh
#!/bin/csh -ef
cd $1
set chr = $1:t
cat `ls -1 *.lav | sort -g` \
| lavToAxt stdin \
    /cluster/bluearc/drosophila/dm1/nib /iscratch/i/dm2/nib stdout \
| axtSort stdin ../../axtChrom/$chr.axt 
'_EOF_'
    # << this line keeps emacs coloring happy
    chmod a+x do.csh
    cp /dev/null jobList
    foreach d (../lav/chr*)
      echo "do.csh $d" >> jobList
    end
    para create jobList
    para try, check, push, check
#Completed: 11 of 11 jobs
#Average job time:                  41s       0.68m     0.01h    0.00d
#Longest job:                       62s       1.03m     0.02h    0.00d
#Submission to last job:            62s       1.03m     0.02h    0.00d


# CHAIN DM2 BLASTZ (FOR LIFTOVER EXPERIMENT) (DONE 9/15/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/dm1/bed/blastz.dm2.2004-09-15
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/dm1/bed/blastz.dm2.2004-09-15/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh -ef
axtChain -verbose=0 $1 \
  /iscratch/i/dm1/nib \
  /iscratch/i/dm2/nib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 11 of 11 jobs
#Average job time:                  20s       0.33m     0.01h    0.00d
#Longest job:                       62s       1.03m     0.02h    0.00d
#Submission to last job:            62s       1.03m     0.02h    0.00d

    # now on the file server, sort chains
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.dm2.2004-09-15/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's [looks like could use some filtering,
    # didn't abridge repeats]
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Don't load chains into database -- just doing this for liftOver.


# NET DM2 BLASTZ (FOR LIFTOVER EXPERIMENT) (DONE 9/15/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.dm2.2004-09-15/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Don't bother with classification and db-loading -- just make 
    # liftOver-suitable (single-coverage from net) chains:
    netSplit noClass.net net
    mkdir subset
    foreach f (chain/*.chain)
      echo subsetting net/$f:t:r.net, $f to subset/$f:t
      netChainSubset net/$f:t:r.net $f subset/$f:t
    end
    cat subset/*.chain > dm1ToDm2.over.chain

    # clean up
    rm -r net subset


# MAKE AN 11.OOC FILE FOR BLAT (DONE 9/16/04 angie)
    # Use -repMatch=100 (based on size -- for human we use 1024, and 
    # fly size is 126527731 / 2866216770 = ~4.4% of human judging by 
    # gapless genome size from featureBits -- we would use 45, but 
    # bump that up a bit to be more conservative)
    ssh kolossus
    mkdir /cluster/data/dm1/bed/ooc
    cd /cluster/data/dm1/bed/ooc
    ls -1 /cluster/data/dm1/nib/chr*.nib > nib.lst
    blat nib.lst /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/dm1/11.ooc -repMatch=100
#Wrote 2917 overused 11-mers to /cluster/bluearc/dm1/11.ooc
    ssh kkr1u00
    cp -p /cluster/bluearc/dm1/*.ooc /iscratch/i/dm1/
    iSync


# BLAT DM2 TO GET LIFTOVER CHAINS (DONE 9/17/04 angie)
    # Note: this process makes /cluster/data/dm1/bed/blat.dm2.2004-09-17/ .
    # Previously I made a blat.dm2.2004-09-16/ using this process (mostly),
    # except for the split and lift steps.  I used the dm2 2Mb chunks and 
    # dm2/jkStuff/liftAll.lft.  Blat run was much slower... liftOver 
    # performance was about the same, so use the quicker smaller chunks.
    ssh kkr1u00
    ~/kent/src/hg/makeDb/makeLoChain/makeLoChain-split.csh \
      dm2 /iscratch/i/dm2/nib
    # Do what its output says to do next:
    ssh kk (or kk9)
    ~/kent/src/hg/makeDb/makeLoChain/makeLoChain-align.csh \
      dm1 /iscratch/i/dm1/nib dm2 /iscratch/i/dm2/split3k \
      /iscratch/i/dm1/11.ooc
    # Do what its output says to do next:
    cd /cluster/data/dm1/bed/blat.dm2.2004-09-16/run
    para try, check, push, check, ...
#Completed: 143 of 143 jobs
#Average job time:                  37s       0.62m     0.01h    0.00d
#Longest job:                      282s       4.70m     0.08h    0.00d
#Submission to last job:           283s       4.72m     0.08h    0.00d
    ssh kksilo
    ~/kent/src/hg/makeDb/makeLoChain/makeLoChain-lift.csh dm1 dm2 \
      /iscratch/i/dm2/split3k
    # Do what its output says to do next:
    ssh kki
    ~/kent/src/hg/makeDb/makeLoChain/makeLoChain-chain.csh \
      dm1 /iscratch/i/dm1/nib dm2 /iscratch/i/dm2/nib
    # Do what its output says to do next:
    cd /cluster/data/dm1/bed/blat.dm2.2004-09-16/chainRun
    para try, check, push, check, ...
#Completed: 13 of 13 jobs
#Average job time:                   7s       0.12m     0.00h    0.00d
#Longest job:                       11s       0.18m     0.00h    0.00d
#Submission to last job:            11s       0.18m     0.00h    0.00d
    ssh kksilo
    ~/kent/src/hg/makeDb/makeLoChain/makeLoChain-net.csh dm1 dm2
    # Do what its output says to do next:
    ssh hgwdev
    ~/kent/src/hg/makeDb/makeLoChain/makeLoChain-load.csh dm1 dm2


# BLASTZ D.ANANASSAE (DONE 11/4/04 angie)
    ssh kksilo
    mkdir /cluster/data/dm1/bed/blastz.droAna1.2004-11-03
    cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03
    ln -s blastz.droAna1.2004-11-03 /cluster/data/dm1/bed/blastz.droAna1
    cat << '_EOF_' > DEF
# D.melanogaster vs. D.ananassae
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm1/nib
SEQ1_SMSK=
SEQ1_FLAG=-drosophila
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - D. ananassae
SEQ2_DIR=/iscratch/i/droAna1/chunksUnsplit
SEQ2_SMSK=
SEQ2_FLAG=-drosophila
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/dm1/bed/blastz.droAna1.2004-11-03

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

'_EOF_'
    # << this line keeps emacs coloring happy
    # run bash shell if you don't already:
    bash
    source DEF
    mkdir run
    /cluster/bin/scripts/blastz-make-joblist $DEF > $BASE/run/j
    sh ./xdir.sh
    cd run
    sed -e 's#^#/cluster/bin/penn/#' j > j2
    wc -l j*
    head j2
    mv j2 j
    # cluster run
    ssh kk
    cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/run
    para create j
    para try, check, push, check, ....
#Completed: 9723 of 9723 jobs
#Average job time:                  22s       0.36m     0.01h    0.00d
#Longest job:                      832s      13.87m     0.23h    0.01d
#Submission to last job:          1145s      19.08m     0.32h    0.01d

    # back in the bash shell on kksilo...
    mkdir /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/run.1
    cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/run.1
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > j
    # small cluster run
    ssh kki
    cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/run.1
    para create j
    para try, check, push, check, ....
#Completed: 21 of 21 jobs
#Average job time:                  32s       0.54m     0.01h    0.00d
#Longest job:                       42s       0.70m     0.01h    0.00d
#Submission to last job:            68s       1.13m     0.02h    0.00d
    cd ..
    rm -r raw

    # Translate .lav to axt:
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03
    mkdir axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin /cluster/data/dm1/nib \
            /cluster/data/droAna1/droAna1.2bit stdout \
        | axtSort stdin ../../$out
      popd
    end


# CHAIN ANANASSAE BLASTZ (DONE 11/4/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir chain
    ls -1S /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh -ef
axtChain -verbose=0 $1 \
  /iscratch/i/dm1/nib \
  /iscratch/i/droAna1/droAna1.2bit stdout \
| chainAntiRepeat /iscratch/i/dm1/nib /iscratch/i/droAna1/droAna1.2bit \
    stdin $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 11 of 11 jobs
#Average job time:                  10s       0.17m     0.00h    0.00d
#Longest job:                       19s       0.32m     0.01h    0.00d
#Submission to last job:            19s       0.32m     0.01h    0.00d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain dm1 ${c}_chainDroAna1 $i
    end


# NET ANANASSAE BLASTZ (DONE 11/4/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/axtChain
    netClass -noAr noClass.net dm1 droAna1 ananassae.net

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/axtChain
    rm noClass.net
    netFilter -syn ananassae.net > ananassaeSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/axtChain
    netFilter -minGap=10 ananassae.net |  hgLoadNet dm1 netDroAna1 stdin
    netFilter -minGap=10 ananassaeSyn.net \
    | hgLoadNet dm1 netSyntenyDroAna1 stdin


# GENERATE DROANA1 AXTNET AND MAF FOR MULTIZ (DONE 11/5/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droAna1/axtChain
    netSplit ananassae.net net
    cd ..
    mkdir axtNet
    foreach f (axtChain/net/chr*.net)
      netToAxt $f axtChain/chain/$f:t:r.chain \
        /cluster/data/dm1/nib /cluster/data/droAna1/droAna1.2bit stdout \
      | axtSort stdin axtNet/$f:t:r.axt
    end
    mkdir mafNet
    foreach f (axtNet/chr*.axt)
      set maf = mafNet/$f:t:r.maf
      axtToMaf $f \
            /cluster/data/dm1/chrom.sizes /cluster/data/droAna1/chrom.sizes \
            $maf -tPrefix=dm1. -qPrefix=droAna1.
    end


# MAKE VSDROANA1 DOWNLOADABLES (DONE 11/5/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droAna1
    nice gzip axtChain/{all.chain,*.net} axtNet/*.axt axtChrom/*.axt
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsDroAna1
    cd /usr/local/apache/htdocs/goldenPath/dm1/vsDroAna1
    cp -p /cluster/data/dm1/bed/blastz.droAna1/axtChain/all.chain.gz \
      ananassae.chain.gz
    cp -p /cluster/data/dm1/bed/blastz.droAna1/axtChain/ananassae.net.gz .
    cp -pR /cluster/data/dm1/bed/blastz.droAna1/axtNet .
    md5sum *.gz */*.gz > md5sum.txt
    # Make a README.txt which explains the files & formats.


# BLASTZ D.MOJAVENSIS (DONE 11/4/04 angie)
    ssh kksilo
    mkdir /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04
    cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04
    ln -s blastz.droMoj1.2004-11-04 /cluster/data/dm1/bed/blastz.droMoj1
    cat << '_EOF_' > DEF
# D.melanogaster vs. D.mojavensis
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm1/nib
SEQ1_SMSK=
SEQ1_FLAG=-drosophila
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - D. mojavensis
SEQ2_DIR=/iscratch/i/droMoj1/chunks
SEQ2_SMSK=
SEQ2_FLAG=-drosophila
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/dm1/bed/blastz.droMoj1.2004-11-04

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

'_EOF_'
    # << this line keeps emacs coloring happy
    # run bash shell if you don't already:
    bash
    source DEF
    mkdir run
    /cluster/bin/scripts/blastz-make-joblist $DEF > $BASE/run/j
    sh ./xdir.sh
    cd run
    sed -e 's#^#/cluster/bin/penn/#' j > j2
    wc -l j*
    head j2
    mv j2 j
    # cluster run
    ssh kk
    cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/run
    para create j
    para try, check, push, check, ....
    # I should have split these a little coarser -- avg job time too small.
#Completed: 18396 of 18396 jobs
#Average job time:                  20s       0.33m     0.01h    0.00d
#Longest job:                      472s       7.87m     0.13h    0.01d
#Submission to last job:           629s      10.48m     0.17h    0.01d

    # back in the bash shell on kksilo...
    mkdir /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/run.1
    cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/run.1
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > j
    # small cluster run
    ssh kki
    cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/run.1
    para create j
    para try, check, push, check, ....
#Completed: 21 of 21 jobs
#Average job time:                  38s       0.63m     0.01h    0.00d
#Longest job:                       43s       0.72m     0.01h    0.00d
#Submission to last job:            75s       1.25m     0.02h    0.00d
    cd ..
    rm -r raw

    # Translate .lav to axt:
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04
    mkdir axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin /cluster/data/dm1/nib \
            /cluster/data/droMoj1/droMoj1.2bit stdout \
        | axtSort stdin ../../$out
      popd
    end


# CHAIN MOJAVENSIS BLASTZ (DONE 11/4/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir chain
    ls -1S /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh -ef
axtChain -verbose=0 $1 \
  /iscratch/i/dm1/nib \
  /iscratch/i/droMoj1/droMoj1.2bit stdout \
| chainAntiRepeat /iscratch/i/dm1/nib /iscratch/i/droMoj1/droMoj1.2bit \
    stdin $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 11 of 11 jobs
#Average job time:                   6s       0.10m     0.00h    0.00d
#Longest job:                       11s       0.18m     0.00h    0.00d
#Submission to last job:            11s       0.18m     0.00h    0.00d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain dm1 ${c}_chainDroMoj1 $i
    end


# NET MOJAVENSIS BLASTZ (DONE 11/4/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/axtChain
    netClass -noAr noClass.net dm1 droMoj1 mojavensis.net 

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/axtChain
    rm noClass.net
    netFilter -syn mojavensis.net > mojavensisSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/axtChain
    netFilter -minGap=10 mojavensis.net |  hgLoadNet dm1 netDroMoj1 stdin
    netFilter -minGap=10 mojavensisSyn.net \
    | hgLoadNet dm1 netSyntenyDroMoj1 stdin


# GENERATE DROMOJ1 AXTNET AND MAF FOR MULTIZ (DONE 11/5/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droMoj1/axtChain
    netSplit mojavensis.net net
    cd ..
    mkdir axtNet
    foreach f (axtChain/net/chr*.net)
      netToAxt $f axtChain/chain/$f:t:r.chain \
        /cluster/data/dm1/nib /cluster/data/droMoj1/droMoj1.2bit stdout \
      | axtSort stdin axtNet/$f:t:r.axt
    end
    mkdir mafNet
    foreach f (axtNet/chr*.axt)
      set maf = mafNet/$f:t:r.maf
      axtToMaf $f \
            /cluster/data/dm1/chrom.sizes /cluster/data/droMoj1/chrom.sizes \
            $maf -tPrefix=dm1. -qPrefix=droMoj1.
    end


# MAKE VSDROMOJ1 DOWNLOADABLES (DONE 11/5/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droMoj1
    nice gzip axtChain/{all.chain,*.net} axtNet/*.axt axtChrom/*.axt
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsDroMoj1
    cd /usr/local/apache/htdocs/goldenPath/dm1/vsDroMoj1
    cp -p /cluster/data/dm1/bed/blastz.droMoj1/axtChain/all.chain.gz \
      mojavensis.chain.gz
    cp -p /cluster/data/dm1/bed/blastz.droMoj1/axtChain/mojavensis.net.gz .
    cp -pR /cluster/data/dm1/bed/blastz.droMoj1/axtNet .
    md5sum *.gz */*.gz > md5sum.txt
    # Make a README.txt which explains the files & formats.


# BLASTZ D.VIRILIS (DONE 11/7/04 angie)
    ssh kksilo
    mkdir /cluster/data/dm1/bed/blastz.droVir1.2004-11-07
    cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07
    ln -s blastz.droVir1.2004-11-07 /cluster/data/dm1/bed/blastz.droVir1
    cat << '_EOF_' > DEF
# D.melanogaster vs. D.virilis
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm1/nib
SEQ1_SMSK=
SEQ1_FLAG=-drosophila
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - D. virilis
SEQ2_DIR=/iscratch/i/droVir1/chunks
SEQ2_SMSK=
SEQ2_FLAG=-drosophila
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/dm1/bed/blastz.droVir1.2004-11-07

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

'_EOF_'
    # << this line keeps emacs coloring happy
    # run bash shell if you don't already:
    bash
    source DEF
    mkdir run
    /cluster/bin/scripts/blastz-make-joblist $DEF > $BASE/run/j
    sh ./xdir.sh
    cd run
    sed -e 's#^#/cluster/bin/penn/#' j > j2
    wc -l j*
    head j2
    mv j2 j
    # cluster run
    ssh kk
    cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/run
    para create j
    para try, check, push, check, ....
#Completed: 2121 of 2121 jobs
#Average job time:                  81s       1.34m     0.02h    0.00d
#Longest job:                      796s      13.27m     0.22h    0.01d
#Submission to last job:          1412s      23.53m     0.39h    0.02d

    # back in the bash shell on kksilo...
    mkdir /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/run.1
    cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/run.1
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > j
    # small cluster run
    ssh kki
    cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/run.1
    para create j
    para try, check, push, check, ....
#Completed: 21 of 21 jobs
#Average job time:                  10s       0.16m     0.00h    0.00d
#Longest job:                       14s       0.23m     0.00h    0.00d
#Submission to last job:            21s       0.35m     0.01h    0.00d
    cd ..
    rm -r raw

    # Translate .lav to axt:
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07
    mkdir axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin /cluster/data/dm1/nib \
            /cluster/data/droVir1/droVir1.2bit stdout \
        | axtSort stdin ../../$out
      popd
    end


# CHAIN VIRILIS BLASTZ (DONE 11/7/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir chain
    ls -1S /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh -ef
axtChain -verbose=0 $1 \
  /iscratch/i/dm1/nib \
  /iscratch/i/droVir1/droVir1.2bit stdout \
| chainAntiRepeat /iscratch/i/dm1/nib /iscratch/i/droVir1/droVir1.2bit \
    stdin $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 11 of 11 jobs
#Average job time:                  10s       0.16m     0.00h    0.00d
#Longest job:                       16s       0.27m     0.00h    0.00d
#Submission to last job:            16s       0.27m     0.00h    0.00d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain dm1 ${c}_chainDroVir1 $i
    end


# NET VIRILIS BLASTZ (DONE 11/7/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/axtChain
    netClass -noAr noClass.net dm1 droVir1 virilis.net 

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/axtChain
    rm noClass.net
    netFilter -syn virilis.net > virilisSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/axtChain
    netFilter -minGap=10 virilis.net |  hgLoadNet dm1 netDroVir1 stdin
    netFilter -minGap=10 virilisSyn.net \
    | hgLoadNet dm1 netSyntenyDroVir1 stdin


# GENERATE DROVIR1 AXTNET AND MAF FOR MULTIZ (DONE 11/7/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droVir1/axtChain
    netSplit virilis.net net
    cd ..
    mkdir axtNet
    foreach f (axtChain/net/chr*.net)
      netToAxt $f axtChain/chain/$f:t:r.chain \
        /cluster/data/dm1/nib /cluster/data/droVir1/droVir1.2bit stdout \
      | axtSort stdin axtNet/$f:t:r.axt
    end
    mkdir mafNet
    foreach f (axtNet/chr*.axt)
      set maf = mafNet/$f:t:r.maf
      axtToMaf $f \
            /cluster/data/dm1/chrom.sizes /cluster/data/droVir1/chrom.sizes \
            $maf -tPrefix=dm1. -qPrefix=droVir1.
    end


# MAKE VSDROVIR1 DOWNLOADABLES (DONE 11/7/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droVir1
    nice gzip axtChain/{all.chain,*.net} axtNet/*.axt axtChrom/*.axt
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsDroVir1
    cd /usr/local/apache/htdocs/goldenPath/dm1/vsDroVir1
    cp -p /cluster/data/dm1/bed/blastz.droVir1/axtChain/all.chain.gz \
      virilis.chain.gz
    cp -p /cluster/data/dm1/bed/blastz.droVir1/axtChain/virilis.net.gz .
    cp -pR /cluster/data/dm1/bed/blastz.droVir1/axtNet .
    md5sum *.gz */*.gz > md5sum.txt
    # Make a README.txt which explains the files & formats.


# BLASTZ A.MELLIFERA (DONE 11/8/04 angie)
    # Use human-fugu sensitivity settings, as with mosquito.
    ssh kksilo
    mkdir /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08
    cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08
    ln -s blastz.apiMel1.2004-11-08 /cluster/data/dm1/bed/blastz.apiMel1
    cat << '_EOF_' > DEF
# D.melanogaster vs. A.mellifera
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm1/nib
SEQ1_SMSK=
SEQ1_FLAG=-drosophila
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - A. mellifera
SEQ2_DIR=/iscratch/i/apiMel1/chunksUnsplit
SEQ2_SMSK=
SEQ2_FLAG=-drosophila
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/dm1/bed/blastz.apiMel1.2004-11-08

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

'_EOF_'
    # << this line keeps emacs coloring happy
    # run bash shell if you don't already:
    bash
    source DEF
    mkdir run
    /cluster/bin/scripts/blastz-make-joblist $DEF > $BASE/run/j
    sh ./xdir.sh
    cd run
    sed -e 's#^#/cluster/bin/penn/#' j > j2
    wc -l j*
    head j2
    mv j2 j
    # cluster run
    ssh kk
    cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/run
    para create j
    para try, check, push, check, ....
#Completed: 6867 of 6867 jobs
#Average job time:                  87s       1.45m     0.02h    0.00d
#Longest job:                      833s      13.88m     0.23h    0.01d
#Submission to last job:          3109s      51.82m     0.86h    0.04d

    # back in the bash shell on kksilo...
    mkdir /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/run.1
    cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/run.1
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > j
    # small cluster run
    ssh kki
    cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/run.1
    para create j
    para try, check, push, check, ....
#Completed: 21 of 21 jobs
#Average job time:                  12s       0.20m     0.00h    0.00d
#Longest job:                       25s       0.42m     0.01h    0.00d
#Submission to last job:            85s       1.42m     0.02h    0.00d
    cd ..
    rm -r raw

    # Translate .lav to axt:
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08
    mkdir axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin /cluster/data/dm1/nib \
            /cluster/data/apiMel1/apiMel1.2bit stdout \
        | axtSort stdin ../../$out
      popd
    end


# CHAIN MELLIFERA BLASTZ (DONE 11/8/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir chain
    ls -1S /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh -ef
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
         -linearGap=/cluster/data/blastz/chickenHumanTuned.gap \
         -verbose=0 $1 \
  /iscratch/i/dm1/nib \
  /iscratch/i/apiMel1/apiMel1.2bit stdout \
| chainAntiRepeat /iscratch/i/dm1/nib /iscratch/i/apiMel1/apiMel1.2bit \
    stdin $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 11 of 11 jobs
#Average job time:                  11s       0.18m     0.00h    0.00d
#Longest job:                       69s       1.15m     0.02h    0.00d
#Submission to last job:            69s       1.15m     0.02h    0.00d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain dm1 ${c}_chainApiMel1 $i
    end


# NET MELLIFERA BLASTZ (DONE 11/8/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/axtChain
    netClass -noAr noClass.net dm1 apiMel1 mellifera.net 

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/axtChain
    rm noClass.net
    netFilter -syn mellifera.net > melliferaSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/axtChain
    netFilter -minGap=10 mellifera.net |  hgLoadNet dm1 netApiMel1 stdin
    netFilter -minGap=10 melliferaSyn.net \
    | hgLoadNet dm1 netSyntenyApiMel1 stdin


# GENERATE APIMEL1 AXTNET AND MAF FOR MULTIZ (DONE 11/8/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.apiMel1/axtChain
    netSplit mellifera.net net
    cd ..
    mkdir axtNet
    foreach f (axtChain/net/chr*.net)
      netToAxt $f axtChain/chain/$f:t:r.chain \
        /cluster/data/dm1/nib /cluster/data/apiMel1/apiMel1.2bit stdout \
      | axtSort stdin axtNet/$f:t:r.axt
    end
    mkdir mafNet
    foreach f (axtNet/chr*.axt)
      set maf = mafNet/$f:t:r.maf
      axtToMaf $f \
            /cluster/data/dm1/chrom.sizes /cluster/data/apiMel1/chrom.sizes \
            $maf -tPrefix=dm1. -qPrefix=apiMel1.
    end


# MAKE VSAPIMEL1 DOWNLOADABLES (DONE 11/8/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.apiMel1
    nice gzip axtChain/{all.chain,*.net} axtNet/*.axt axtChrom/*.axt
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsApiMel1
    cd /usr/local/apache/htdocs/goldenPath/dm1/vsApiMel1
    cp -p /cluster/data/dm1/bed/blastz.apiMel1/axtChain/all.chain.gz \
      mellifera.chain.gz
    cp -p /cluster/data/dm1/bed/blastz.apiMel1/axtChain/mellifera.net.gz .
    cp -pR /cluster/data/dm1/bed/blastz.apiMel1/axtNet .
    md5sum *.gz */*.gz > md5sum.txt
    # Make a README.txt which explains the files & formats.


# DATAMINE FOR PHYLOGENETIC TREE INFO FOR 12-FLY PROJECT (DONE 11/9/04 angie)
    # Google... -> Entrez "Tamura K" ->
#Temporal patterns of fruit fly (Drosophila) evolution revealed by mutation clocks.
#Tamura K, Subramanian S, Kumar S.
#http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Retrieve&db=pubmed&dopt=Abstract&list_uids=12949132
#Our analysis of 2977 pairwise sequence comparisons from 176 nuclear
#genes reveals a long-term fruit fly mutation clock ticking at a rate
#of 11.1 mutations per kilobase pair per Myr. Genomic mutation
#clock-based timings of the landmark speciation events leading to the
#evolution of D. melanogaster show that it shared most recent common
#ancestry
# 5.4 MYA with D. simulans, 
#12.6 MYA with D. erecta+D. orena, 
#12.8 MYA with D. yakuba+D. teisseri, 
#35.6 MYA with the takahashii subgroup, 
#41.3 MYA with the montium subgroup, 
#44.2 MYA with the ananassae subgroup, 
#54.9 MYA with the obscura group, 
#62.2 MYA with the willistoni group, and 
#62.9 MYA with the subgenus Drosophila.

    # 12-species project whitepaper (June 2003):
# http://genome.gov/Pages/Research/Sequencing/SeqProposals/Drosophila.pdf
# Eventual 12-fly tree from page 3:
   (((((((melanogaster (simulans sechellia))
	 yakuba)
	erecta)
       ananassae)
      (pseudoobscura persimilis))
     willistoni)
    (grimshawi (mojavensis virilis)))
# Rough MYA from page 3 (Some 2-4x shorter estimates than Tamura et al!):
   (((((((melanogaster (simulans sechellia):1):4
	 yakuba:5)
	erecta:7)
       ananassae:13)
      (pseudoobscura persimilis):1):27
     willistoni):37
    (grimshawi (mojavensis virilis):25):7)

    # chainLink coverage of species aligned to date:
    ssh hgwdev
    foreach db (DroYak1 DroAna1 Dp2 DroVir1 DroMoj1 AnoGam1 ApiMel1)
      echo -n "$db: "
      nice featureBits dm1 chain${db}Link
    end
#DroYak1: 108256417 bases of 126527731 (85.559%) in intersection
#DroAna1: 82740388 bases of 126527731 (65.393%) in intersection
#Dp2:     73920914 bases of 126527731 (58.423%) in intersection
#DroVir1: 53742261 bases of 126527731 (42.475%) in intersection
#DroMoj1: 49844756 bases of 126527731 (39.394%) in intersection
#AnoGam1: 19437860 bases of 126527731 (15.363%) in intersection
#ApiMel1: 16555940 bases of 126527731 (13.085%) in intersection


# VAR_MULTIZ INSECTS TO DATE (DONE 11/9/04 angie)
    # Today's tree (8-way):
    # ((((((dm1 droYak1) droAna1) dp2) (droVir1 droMoj1)) anoGam1) apiMel1)
    ssh kksilo
    mkdir /cluster/data/dm1/bed/multiz8way.2004-11-09
    ln -s /cluster/data/dm1/bed/multiz8way.2004-11-09 \
      /cluster/data/dm1/bed/multiz8way
    cd /cluster/data/dm1/bed/multiz8way
    # Setup: Copy pairwise MAF to /cluster/bluearc:
    mkdir /cluster/bluearc/flyVarMultiz8way
    foreach db (droYak1 droAna1 dp2 droVir1 droMoj1 anoGam1 apiMel1)
      cp -pR /cluster/data/dm1/bed/blastz.$db/mafNet \
        /cluster/bluearc/flyVarMultiz8way/$db
    end
    # Make output dir:
    mkdir maf
    # Create script to run var_multiz in the right order:
    cat << '_EOF_' > doVarMultiz.csh
#!/bin/csh -fe
set chr = $1
set tmp = /scratch/flyVarMultiz8way.$chr
mkdir $tmp

set REF = dm1.$chr
set YAK = /cluster/bluearc/flyVarMultiz8way/droYak1/$chr.maf
set ANA = /cluster/bluearc/flyVarMultiz8way/droAna1/$chr.maf
set PSE = /cluster/bluearc/flyVarMultiz8way/dp2/$chr.maf
set VIR = /cluster/bluearc/flyVarMultiz8way/droVir1/$chr.maf
set MOJ = /cluster/bluearc/flyVarMultiz8way/droMoj1/$chr.maf
set ANO = /cluster/bluearc/flyVarMultiz8way/anoGam1/$chr.maf
set API = /cluster/bluearc/flyVarMultiz8way/apiMel1/$chr.maf
set DEST = /cluster/data/dm1/bed/multiz8way/maf/$chr.maf

set VMZ = /cluster/bin/penn/var_multiz
set PROJECT = /cluster/bin/penn/maf_project

if ( -s $YAK && -s $ANA ) then 
    echo "Aligning $YAK $ANA..."
    $VMZ $YAK $ANA 1 0 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.YakAna.maf
else if ( -s $YAK ) then
    cp $YAK $tmp/$chr.YakAna.maf
else if ( -s $ANA ) then
    cp $ANA $tmp/$chr.YakAna.maf
endif

if ( -s $PSE && -s $tmp/$chr.YakAna.maf ) then
    echo "Adding $PSE..."
    $VMZ $tmp/$chr.YakAna.maf $PSE 1 0 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.YakAnaPse.maf
else if ( -s $PSE ) then
    cp $PSE $tmp/$chr.YakAnaPse.maf
else if ( -s $tmp/$chr.YakAna.maf ) then
    cp $tmp/$chr.YakAna.maf $tmp/$chr.YakAnaPse.maf
endif 

# droVir1 and droMoj1 are a subtree -- run on just them, then fold in:
if ( -s $VIR && -s $MOJ ) then 
    echo "Aligning $VIR $MOJ..."
    $VMZ $VIR $MOJ 0 0 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.VirMoj.maf
else if ( -s $VIR ) then
    cp $VIR $tmp/$chr.VirMoj.maf
else if ( -s $MOJ ) then
    cp $MOJ $tmp/$chr.VirMoj.maf
endif

if ( -s $tmp/$chr.VirMoj.maf && -s $tmp/$chr.YakAnaPse.maf ) then
    echo "Adding $tmp/$chr.VirMoj.maf..."
    $VMZ $tmp/$chr.YakAnaPse.maf $tmp/$chr.VirMoj.maf 1 0 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.YakAnaPseVirMoj.maf
else if ( -s $tmp/$chr.VirMoj.maf ) then
    cp $tmp/$chr.VirMoj.maf $tmp/$chr.YakAnaPseVirMoj.maf
else if ( -s $tmp/$chr.YakAnaPse.maf ) then
    cp $tmp/$chr.YakAnaPse.maf $tmp/$chr.YakAnaPseVirMoj.maf
endif 

if ( -s $ANO && -s $tmp/$chr.YakAnaPseVirMoj.maf ) then
    echo "Adding $ANO..."
    $VMZ $tmp/$chr.YakAnaPseVirMoj.maf $ANO 1 0 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.YakAnaPseVirMojAno.maf
else if ( -s $ANO ) then
    cp $ANO $tmp/$chr.YakAnaPseVirMojAno.maf
else if ( -s $tmp/$chr.YakAnaPseVirMoj.maf ) then
    cp $tmp/$chr.YakAnaPseVirMoj.maf $tmp/$chr.YakAnaPseVirMojAno.maf
endif 

if ( -s $API && -s $tmp/$chr.YakAnaPseVirMojAno.maf ) then
    echo "Adding $API..."
    $VMZ $tmp/$chr.YakAnaPseVirMojAno.maf $API 1 0 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.final.maf
    cp $tmp/$chr.final.maf $DEST
else if ( -s $API ) then
    cp $API $DEST
else if ( -s $tmp/$chr.YakAnaPseVirMojAno.maf ) then
    cp $tmp/$chr.YakAnaPseVirMojAno.maf $DEST
endif 

rm $tmp/$chr.*.maf
rmdir $tmp
'_EOF_'
    # << keep emacs coloring happy    
    chmod 755 doVarMultiz.csh
    awk '{print "./doVarMultiz.csh " $1;}' /cluster/data/dm1/chrom.sizes \
      > jobs.lst
    # Run on small cluster
    ssh kki
    cd /cluster/data/dm1/bed/multiz8way
    para create jobs.lst
    para try, check, push, check, ...
#Completed: 11 of 11 jobs
#Average job time:                1272s      21.19m     0.35h    0.01d
#Longest job:                     3747s      62.45m     1.04h    0.04d
#Submission to last job:          3747s      62.45m     1.04h    0.04d

    # make /gbdb/ links to 8way and pairwise maf files:
    ssh hgwdev
    mkdir -p /gbdb/dm1/multiz8way/maf/multiz8way
    ln -s /cluster/data/dm1/bed/multiz8way/maf/chr*.maf \
      /gbdb/dm1/multiz8way/maf/multiz8way/
    # pairwise setting from trackDb.ra entry:
    set pairwise = 041109
    foreach orgName (d_yakuba d_ananassae d_pseudoobscura d_virilis \
                     d_mojavensis a_gambiae a_mellifera)
      mkdir /gbdb/dm1/multiz8way/maf/${orgName}_$pairwise
    end
    ln -s /cluster/data/dm1/bed/blastz.droYak1.2004-05-22/mafNet/*.maf \
      /gbdb/dm1/multiz8way/maf/d_yakuba_$pairwise
    ln -s /cluster/data/dm1/bed/blastz.droAna1.2004-11-03/mafNet/*.maf \
      /gbdb/dm1/multiz8way/maf/d_ananassae_$pairwise
    ln -s /cluster/data/dm1/bed/blastz.dp2.2004-08-03/mafNet/*.maf \
      /gbdb/dm1/multiz8way/maf/d_pseudoobscura_$pairwise
    ln -s /cluster/data/dm1/bed/blastz.droVir1.2004-11-07/mafNet/*.maf \
      /gbdb/dm1/multiz8way/maf/d_virilis_$pairwise
    ln -s /cluster/data/dm1/bed/blastz.droMoj1.2004-11-04/mafNet/*.maf \
      /gbdb/dm1/multiz8way/maf/d_mojavensis_$pairwise
    ln -s /cluster/data/dm1/bed/blastz.anoGam1.2004-06-10/mafNet/*.maf \
      /gbdb/dm1/multiz8way/maf/a_gambiae_$pairwise
    ln -s /cluster/data/dm1/bed/blastz.apiMel1.2004-11-08/mafNet/*.maf \
      /gbdb/dm1/multiz8way/maf/a_mellifera_$pairwise
    # load into database
    cd /tmp
    hgLoadMaf -warn dm1 multiz8way \
      -pathPrefix=/gbdb/dm1/multiz8way/maf/multiz8way
    foreach orgName (d_yakuba d_ananassae d_pseudoobscura d_virilis \
                     d_mojavensis a_gambiae a_mellifera)
      hgLoadMaf -WARN dm1 ${orgName}_$pairwise \
        -pathPrefix=/gbdb/dm1/multiz8way/maf/${orgName}_$pairwise
    end

    # put 8way MAF out for download
    ssh kksilo
    cd /cluster/data/dm1/bed/multiz8way
    mkdir ../../zips/mzMafs
    foreach f (maf/*.maf)
      gzip -c $f > ../../zips/mzMafs/$f:t.gz
    end
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm1/multiz8way
    cd /usr/local/apache/htdocs/goldenPath/dm1/multiz8way
    mv /cluster/data/dm1/zips/mzMafs/* .
    rmdir /cluster/data/dm1/zips/mzMafs
    md5sum *.gz > md5sum.txt
    # make a README.txt

    # Cleanup
    rm -rf /cluster/bluearc/flyVarMultiz8way/


# PHASTCONS 8WAY (DONE 11/10/04 angie)
    ssh kksilo
    # copy chrom fa to bluearc, break up the genome-wide MAFs into pieces
    mkdir -p /cluster/bluearc/dm1/chrom
    cp -p /cluster/data/dm1/?{,?}/chr*.fa /cluster/bluearc/dm1/chrom/
    ssh kki
    mkdir /cluster/data/dm1/bed/multiz8way/phastCons
    mkdir /cluster/data/dm1/bed/multiz8way/phastCons/run.split
    cd /cluster/data/dm1/bed/multiz8way/phastCons/run.split
    set WINDOWS = /cluster/bluearc/dm1/phastCons/WINDOWS
    rm -fr $WINDOWS
    mkdir -p $WINDOWS
    cat << 'EOF' > doSplit.sh
#!/bin/csh -ef

set PHAST=/cluster/bin/phast
set FA_SRC=/cluster/bluearc/dm1/chrom
set WINDOWS=/cluster/bluearc/dm1/phastCons/WINDOWS

set maf=$1
set c = $maf:t:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
# apiMel1-specific tweak for msa_split:
perl -wpe 's/(apiMel1\.Group\w+)\.(\d+)/$1_$2/' $1 > $tmpDir/$c.maf
${PHAST}/msa_split $tmpDir/$c.maf -i MAF -M ${FA_SRC}/$c.fa \
   -O dm1,droYak1,droAna1,dp2,droVir1,droMoj1,anoGam1,apiMel1 \
   -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
  gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.maf
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
    chmod a+x doSplit.sh
    rm -f jobList
    foreach file (/cluster/data/dm1/bed/multiz8way/maf/*.maf) 
      if (-s $file) then
        echo "doSplit.sh {check in line+ $file}" >> jobList
      endif
    end
    para create jobList
    para try,  check, push, check...
#Completed: 11 of 11 jobs
#Average job time:                  50s       0.83m     0.01h    0.00d
#Longest job:                      130s       2.17m     0.04h    0.00d
#Submission to last job:           130s       2.17m     0.04h    0.00d
    cd ..

    # use the model previously estimated as a starting model...
    cp /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/starting-tree.mod .
    # then add the new species with scaled distance estimates.  
    # As long as they're reasonable, the estimation process should be 
    # able to converge on the right values.  
    $EDITOR starting-tree.mod
#TREE: ((((((dm1:0.058,droYak1:0.074):0.1,droAna1:0.17):0.33,dp2:0.200):0,(droVir1:0.3,droMoj1:0.3):0):0.2,anoGam1:0.5):0,apiMel1:0.5);

    # Get genome-wide average GC content (for all species together,
    # not just the reference genome).  If you have a globally
    # estimated tree model, as above, you can get this from the
    # BACKGROUND line in the .mod file.  E.g.,
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.276938 0.223190 0.223142 0.276730
    # add up the C and G:
    awk '$1 == "BACKGROUND:" {printf "%0.3f\n", $3 + $4;}' starting-tree.mod
#0.446
    # Great, use 0.446 as the --gc parameter in phastCons below:.

    # Now set up cluster job to estimate model parameters.  
    # Use parameters iteratively determined for dm2 -- see makeDm2.doc:
    # --target-coverage 0.20 --expected-lengths 25
    # Parameters will be estimated separately for each alignment fragment 
    # then will be combined across fragments.

    mkdir run.estimate
    cd run.estimate
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.446 --nrates 1,1 \
    --no-post-probs --ignore-missing --expected-lengths 25 \
    --target-coverage 0.20 --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    chmod a+x doEstimate.sh
    rm -fr LOG TREES
    mkdir -p LOG TREES
    rm -f jobList
    foreach f (/cluster/bluearc/dm1/phastCons/WINDOWS/*.ss.gz)
      set root = $f:t:r:r
      echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
    end
    # run cluster job
    ssh kk9
    cd /cluster/data/dm1/bed/multiz8way/phastCons/run.estimate
    para create jobList
    para try, check, push, check, ...
#Completed: 134 of 134 jobs
#Average job time:                2054s      34.23m     0.57h    0.02d
#Longest job:                     4051s      67.52m     1.13h    0.05d
#Submission to last job:         64611s    1076.85m    17.95h    0.75d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh kksilo
    cd /cluster/data/dm1/bed/multiz8way/phastCons/run.estimate
    ls -1 TREES/*.cons.mod > cons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
      --output-average ave.cons.mod > cons_summary.txt
    ls -1 TREES/*.noncons.mod > noncons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
      --output-average ave.noncons.mod > noncons_summary.txt
    grep TREE ave*.mod
#ave.cons.mod:TREE: ((((((dm1:0.024149,droYak1:0.028681):0.037845,droAna1:0.074610):0.020246,dp2:0.078803):0.018879,(droVir1:0.046349,droMoj1:0.051929):0.065674):0.063543,anoGam1:0.171043):0.118046,apiMel1:0.118046);
#ave.noncons.mod:TREE: ((((((dm1:0.084284,droYak1:0.100215):0.137342,droAna1:0.270012):0.074910,dp2:0.285747):0.071341,(droVir1:0.165827,droMoj1:0.186367):0.245918):0.239605,anoGam1:0.624082):0.439708,apiMel1:0.439708);
    cat cons_summary.txt 
    # look over the files cons_summary.txt and noncons_summary.txt.
    # The means and medians should be roughly equal and the stdevs
    # should be reasonably small compared to the means, particularly
    # for rate matrix parameters (at bottom) and for branches to the
    # leaves of the tree.  The stdevs may be fairly high for branches
    # near the root of the tree; that's okay.  Some min values may be
    # 0 for some parameters.  That's okay, but watch out for very large
    # values in the max column, which might skew the mean.  If you see
    # any signs of bad outliers, you may have to track down the
    # responsible .mod files and throw them out.  I've never had to do
    # this; the estimates generally seem pretty well behaved.

    # NOTE: Actually, a random sample of several hundred to a thousand
    # alignment fragments (say, a number equal to the number of
    # available cluster nodes) should be more than adequate for
    # parameter estimation.  If pressed for time, use this strategy.

    # Now we are ready to set up the cluster job for computing the
    # conservation scores and predicted elements.  It's all downhill
    # from here.
    ssh kk9
    mkdir /cluster/data/dm1/bed/multiz8way/phastCons/run.phast
    cd /cluster/data/dm1/bed/multiz8way/phastCons/run.phast
    cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
    ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
    --expected-lengths 25 --target-coverage 0.20 --quiet --seqname $chr \
    --idpref $pref \
    --viterbi /cluster/bluearc/dm1/phastCons/ELEMENTS/$pref.bed --score \
    --require-informative 0 \
  > $tmpfile
gzip -c $tmpfile > /cluster/bluearc/dm1/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
    chmod a+x doPhastCons.sh
    rm -fr /cluster/bluearc/dm1/phastCons/{POSTPROBS,ELEMENTS}
    mkdir -p /cluster/bluearc/dm1/phastCons/{POSTPROBS,ELEMENTS}
    rm -f jobList
    foreach f (/cluster/bluearc/dm1/phastCons/WINDOWS/*.ss.gz)
      echo doPhastCons.sh $f >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 134 of 134 jobs
#Average job time:                  15s       0.26m     0.00h    0.00d
#Longest job:                       22s       0.37m     0.01h    0.00d
#Submission to last job:            48s       0.80m     0.01h    0.00d

    # back on kksilo:
    # combine predictions and transform scores to be in 0-1000 interval
    # do in a way that avoids limits on numbers of args
    cd /cluster/data/dm1/bed/multiz8way/phastCons
    awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
      /cluster/bluearc/dm1/phastCons/ELEMENTS/*.bed \
    | /cluster/bin/scripts/lodToBedScore > all.bed

    ssh hgwdev
    cd /cluster/data/dm1/bed/multiz8way/phastCons
    featureBits dm1 all.bed
#38825802 bases of 126527731 (30.686%) in intersection
    # OK, close enough.
    hgLoadBed dm1 phastConsElements8way all.bed

    # Create wiggle on the small cluster
    ssh kki
    mkdir /cluster/data/dm1/bed/multiz8way/phastCons/run.wib
    cd /cluster/data/dm1/bed/multiz8way/phastCons/run.wib
    rm -rf /cluster/bluearc/dm1/phastCons/wib
    mkdir -p /cluster/bluearc/dm1/phastCons/wib
    cat << 'EOF' > doWigEncode
#!/bin/csh -ef
set chr = $1
cd /cluster/bluearc/dm1/phastCons/wib
zcat `ls -1 /cluster/bluearc/dm1/phastCons/POSTPROBS/$chr.*.pp.gz \
      | sort -t\. -k2,2n` \
| wigEncode stdin ${chr}_phastCons.wi{g,b}
'EOF'
# << for emacs
    chmod a+x doWigEncode
    rm -f jobList
    foreach chr (`ls -1 /cluster/bluearc/dm1/phastCons/POSTPROBS \
                  | awk -F\. '{print $1}' | sort -u`)
      echo doWigEncode $chr >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 11 of 11 jobs
#Average job time:                  10s       0.16m     0.00h    0.00d
#Longest job:                       22s       0.37m     0.01h    0.00d
#Submission to last job:            22s       0.37m     0.01h    0.00d

    # back on kksilo, copy wibs, wigs and POSTPROBS (people sometimes want 
    # the raw scores) from bluearc
    cd /cluster/data/dm1/bed/multiz8way/phastCons
    rm -rf wib POSTPROBS
    rsync -av /cluster/bluearc/dm1/phastCons/wib .
    rsync -av /cluster/bluearc/dm1/phastCons/POSTPROBS .

    # load wiggle component of Conservation track
    ssh hgwdev
    mkdir /gbdb/dm1/multiz8way/wib
    cd /cluster/data/dm1/bed/multiz8way/phastCons
    chmod 775 . wib
    chmod 664 wib/*.wib
    ln -s `pwd`/wib/*.wib /gbdb/dm1/multiz8way/wib/
    hgLoadWiggle dm1 phastCons8way \
      -pathPrefix=/gbdb/dm1/multiz8way/wib wib/*.wig

    # make top-5000 list and launcher on Adam's home page:
    sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \
    > top5000.bed
    /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
      /cse/grads/acs/public_html/dm-top5000-8way \
      "top 5000 conserved elements (8way)" dm1

    # and clean up bluearc.
    rm -r /cluster/bluearc/dm1/phastCons
# NOT DONE (I see more phastCons in the future :)
    rm -r /cluster/bluearc/dm1/chrom


# BLASTZ D. PSEUDOOBSCURA (DP3) (DONE 11/16/04 angie)
    ssh kksilo
    mkdir /cluster/data/dm1/bed/blastz.dp3.2004-11-16
    cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16
    ln -s blastz.dp3.2004-11-16 /cluster/data/dm1/bed/blastz.dp3
    cat << '_EOF_' > DEF
# D.melanogaster vs. D.pseudoobscura
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm1/nib
# unused: SEQ1_RMSK=
SEQ1_SMSK=
SEQ1_FLAG=-drosophila
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - D. pseudoobscura
SEQ2_DIR=/iscratch/i/dp3/nib
# unused: SEQ2_RMSK=
SEQ2_SMSK=
SEQ2_FLAG=-drosophila
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/dm1/bed/blastz.dp3.2004-11-16

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

'_EOF_'
    # << this line keeps emacs coloring happy
    # run bash shell if you don't already:
    bash
    source DEF
    mkdir run
    ~angie/hummus/make-joblist $DEF > $BASE/run/j
    sh ./xdir.sh
    cd run
    sed -e 's#^#/cluster/bin/penn/#' j > j2
    wc -l j*
    head j2
    mv j2 j
    # cluster run
    ssh kk
    cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/run
    para create j
    para try, check, push, check, ....
#Completed: 504 of 504 jobs
#Average job time:                 196s       3.26m     0.05h    0.00d
#Longest job:                      931s      15.52m     0.26h    0.01d
#Submission to last job:          1285s      21.42m     0.36h    0.01d

    # back in the bash shell on kksilo...
    mkdir /cluster/data/dm1/bed/blastz.dp3.2004-11-16/run.1
    cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/run.1
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > j
    # small cluster run
    ssh kki
    cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/run.1
    para create j
    para try, check, push, check, ....
#Completed: 21 of 21 jobs
#Average job time:                  11s       0.18m     0.00h    0.00d
#Longest job:                       21s       0.35m     0.01h    0.00d
#Submission to last job:            27s       0.45m     0.01h    0.00d

    # Translate .lav to axt:
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16
    rm -r raw
    mkdir axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin /cluster/data/dm1/nib /cluster/data/dp3/nib stdout \
        | axtSort stdin ../../$out
      popd
    end


# CHAIN PSEUDOOBSCURA BLASTZ (DONE 11/17/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir chain
    ls -1S /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh -ef
axtChain -verbose=0 $1 \
  /iscratch/i/dm1/nib \
  /iscratch/i/dp3/nib stdout \
| chainAntiRepeat /iscratch/i/dm1/nib /iscratch/i/dp3/nib \
    stdin $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 11 of 11 jobs
#Average job time:                  13s       0.22m     0.00h    0.00d
#Longest job:                       25s       0.42m     0.01h    0.00d
#Submission to last job:            25s       0.42m     0.01h    0.00d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain dm1 ${c}_chainDp3 $i
    end


# NET PSEUDOOBSCURA BLASTZ (DONE 11/17/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChain
    netClass -noAr noClass.net dm1 dp3 pseudoobscura.net

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChain
    rm noClass.net
    netFilter -syn pseudoobscura.net > pseudoobscuraSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChain
    netFilter -minGap=10 pseudoobscura.net |  hgLoadNet dm1 netDp3 stdin
    netFilter -minGap=10 pseudoobscuraSyn.net \
    | hgLoadNet dm1 netSyntenyDp3 stdin


# GENERATE DP3 AXTNET AND MAF FOR MULTIZ (DONE 11/17/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.dp3/axtChain
    netSplit pseudoobscura.net net
    cd ..
    mkdir axtNet
    foreach f (axtChain/net/chr*.net)
      netToAxt $f axtChain/chain/$f:t:r.chain \
        /cluster/data/dm1/nib /cluster/data/dp3/dp3.2bit stdout \
      | axtSort stdin axtNet/$f:t:r.axt
    end
    mkdir mafNet
    foreach f (axtNet/chr*.axt)
      set maf = mafNet/$f:t:r.maf
      axtToMaf $f \
            /cluster/data/dm1/chrom.sizes /cluster/data/dp3/chrom.sizes \
            $maf -tPrefix=dm1. -qPrefix=dp3.
    end


# MAKE VSDP3 DOWNLOADABLES (DONE 11/17/04 angie)
    ssh kksilo
    mkdir /cluster/data/dm1/zips/axtNet
    gzip -c \
      /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChain/all.chain \
      > /cluster/data/dm1/zips/pseudoobscura.chain.gz
    gzip -c \
      /cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtChain/pseudoobscura.net \
      > /cluster/data/dm1/zips/pseudoobscura.net.gz
    foreach f (/cluster/data/dm1/bed/blastz.dp3.2004-11-16/axtNet/chr*axt)
      gzip -c $f > /cluster/data/dm1/zips/axtNet/$f:t.gz
    end
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsDp3
    cd /usr/local/apache/htdocs/goldenPath/dm1/vsDp3
    mv /cluster/data/dm1/zips/pseu*gz .
    mv /cluster/data/dm1/zips/axtNet .
    md5sum *.gz */*.gz > md5sum.txt
    # Make a README.txt which explains the files & formats.


# VAR_MULTIZ FLIES + MOSQUITO (NO HONEYBEE) FOR PHASTCONS PAPER (DONE 11/18/04 angie)
    # Tree of genomes safe to use in paper (7-way):
    # (((((dm1 droYak1) droAna1) dp2) (droVir1 droMoj1)) anoGam1)
    ssh kksilo
    mkdir /cluster/data/dm1/bed/multiz7way.2004-11-18
    ln -s /cluster/data/dm1/bed/multiz7way.2004-11-18 \
      /cluster/data/dm1/bed/multiz7way
    cd /cluster/data/dm1/bed/multiz7way
    # Normally would copy pairwise MAF to /cluster/bluearc, but I haven't
    # cleaned up /cluster/bluearc/flyVarMultiz8way so just reuse it.
    # Make output dir:
    mkdir maf
    # Create script to run var_multiz in the right order:
    cat << '_EOF_' > doVarMultiz.csh
#!/bin/csh -fe
set chr = $1
set tmp = /scratch/flyVarMultiz7way.$chr
mkdir $tmp

set REF = dm1.$chr
set YAK = /cluster/bluearc/flyVarMultiz8way/droYak1/$chr.maf
set ANA = /cluster/bluearc/flyVarMultiz8way/droAna1/$chr.maf
set PSE = /cluster/bluearc/flyVarMultiz8way/dp2/$chr.maf
set VIR = /cluster/bluearc/flyVarMultiz8way/droVir1/$chr.maf
set MOJ = /cluster/bluearc/flyVarMultiz8way/droMoj1/$chr.maf
set ANO = /cluster/bluearc/flyVarMultiz8way/anoGam1/$chr.maf
set DEST = /cluster/data/dm1/bed/multiz7way/maf/$chr.maf

set VMZ = /cluster/bin/penn/var_multiz
set PROJECT = /cluster/bin/penn/maf_project

if ( -s $YAK && -s $ANA ) then 
    echo "Aligning $YAK $ANA..."
    $VMZ $YAK $ANA 1 0 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.YakAna.maf
else if ( -s $YAK ) then
    cp $YAK $tmp/$chr.YakAna.maf
else if ( -s $ANA ) then
    cp $ANA $tmp/$chr.YakAna.maf
endif

if ( -s $PSE && -s $tmp/$chr.YakAna.maf ) then
    echo "Adding $PSE..."
    $VMZ $tmp/$chr.YakAna.maf $PSE 1 0 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.YakAnaPse.maf
else if ( -s $PSE ) then
    cp $PSE $tmp/$chr.YakAnaPse.maf
else if ( -s $tmp/$chr.YakAna.maf ) then
    cp $tmp/$chr.YakAna.maf $tmp/$chr.YakAnaPse.maf
endif 

# droVir1 and droMoj1 are a subtree -- run on just them, then fold in:
if ( -s $VIR && -s $MOJ ) then 
    echo "Aligning $VIR $MOJ..."
    $VMZ $VIR $MOJ 0 0 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.VirMoj.maf
else if ( -s $VIR ) then
    cp $VIR $tmp/$chr.VirMoj.maf
else if ( -s $MOJ ) then
    cp $MOJ $tmp/$chr.VirMoj.maf
endif

if ( -s $tmp/$chr.VirMoj.maf && -s $tmp/$chr.YakAnaPse.maf ) then
    echo "Adding $tmp/$chr.VirMoj.maf..."
    $VMZ $tmp/$chr.YakAnaPse.maf $tmp/$chr.VirMoj.maf 1 0 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.YakAnaPseVirMoj.maf
else if ( -s $tmp/$chr.VirMoj.maf ) then
    cp $tmp/$chr.VirMoj.maf $tmp/$chr.YakAnaPseVirMoj.maf
else if ( -s $tmp/$chr.YakAnaPse.maf ) then
    cp $tmp/$chr.YakAnaPse.maf $tmp/$chr.YakAnaPseVirMoj.maf
endif 

if ( -s $ANO && -s $tmp/$chr.YakAnaPseVirMoj.maf ) then
    echo "Adding $ANO..."
    $VMZ $tmp/$chr.YakAnaPseVirMoj.maf $ANO 1 0 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.YakAnaPseVirMojAno.maf
    cp $tmp/$chr.YakAnaPseVirMojAno.maf $DEST
else if ( -s $ANO ) then
    cp $ANO $DEST
else if ( -s $tmp/$chr.YakAnaPseVirMoj.maf ) then
    cp $tmp/$chr.YakAnaPseVirMoj.maf $DEST
endif 

rm $tmp/$chr.*.maf
rmdir $tmp
'_EOF_'
    # << keep emacs coloring happy    
    chmod 755 doVarMultiz.csh
    awk '{print "./doVarMultiz.csh " $1;}' /cluster/data/dm1/chrom.sizes \
      > jobs.lst
    # Run on small cluster
    ssh kki
    cd /cluster/data/dm1/bed/multiz7way
    para create jobs.lst
    para try, check, push, check, ...
#Completed: 11 of 11 jobs
#Average job time:                1098s      18.29m     0.30h    0.01d
#Longest job:                     3281s      54.68m     0.91h    0.04d
#Submission to last job:          3281s      54.68m     0.91h    0.04d

    # make /gbdb/ links to 7way maf files:
    ssh hgwdev
    mkdir -p /gbdb/dm1/multiz7way/maf/multiz7way
    ln -s /cluster/data/dm1/bed/multiz7way/maf/chr*.maf \
      /gbdb/dm1/multiz7way/maf/multiz7way/
    # -- just reuse pairwise tables/files from 8way, instead of duplicating.
    # load into database
    cd /tmp
    hgLoadMaf -warn dm1 multiz7way \
      -pathPrefix=/gbdb/dm1/multiz7way/maf/multiz7way
    # -- again, normally we would load pairwise maf tables -- reuse 8way's.

    # put 7way MAF out for download
    ssh kksilo
    cd /cluster/data/dm1/bed/multiz7way
    mkdir ../../zips/mzMafs
    foreach f (maf/*.maf)
      gzip -c $f > ../../zips/mzMafs/$f:t.gz
    end
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm1/multiz7way
    cd /usr/local/apache/htdocs/goldenPath/dm1/multiz7way
    mv /cluster/data/dm1/zips/mzMafs/* .
    rmdir /cluster/data/dm1/zips/mzMafs
    md5sum *.gz > md5sum.txt
    # make a README.txt

    # Cleanup
    rm -rf /cluster/bluearc/flyVarMultiz8way/


# MEASURE MAF CDS COVERAGE FOR PHASTCONS PAPER (DONE 11/30/04 angie)
    # 11/19, before running phastCons:
    ssh kolossus
    mkdir /cluster/data/dm1/bed/multiz7way/coverage
    cd /cluster/data/dm1/bed/multiz7way/coverage
    cat ../maf/chr*.maf \
    | nice mafRanges -notAllOGap stdin dm1 dm1.7way.mafRanges.bed
    ssh hgwdev
    cd /cluster/data/dm1/bed/multiz7way/coverage
    # Use the alignment coverage of CDS in dm1 vs. hg17 as a scaling 
    # factor for phastCons target coverage, according to this equation:
    #  targetCovCds_fly = targetCovCds_vert * alCovCds_fly / alCovCds_vert
    # alCovCds_fly is the "cover" figure:
    nice featureBits -enrichment dm1 bdgpGene:cds dm1.7way.mafRanges.bed
#bdgpGene:cds 17.224%, dm1.7way.mafRanges.bed 87.811%, both 17.159%, cover 99.62%, enrich 1.13x
    # Per Adam, targetCovCds_vert = ~2/3, alCovCds_vert = 0.964
    calc 2/3 \* 0.996 / 0.964
#2/3 * 0.996 / 0.964 = 0.688797
    # OK, we will aim for a fly phastConsElements CDS coverage of 68.9% below.
    # We may have to ask for something different in the parameters, but we 
    # will keep trying out different parameters until we get an actual 
    # phastConsElements CDS coverage of 68.9%.  
    # So how did 8way fare?
    nice featureBits -enrichment dm1 bdgpGene:cds phastConsElements8way
#bdgpGene:cds 17.224%, phastConsElements8way 30.686%, both 9.712%, cover 56.39%, enrich 1.84x
    # 56% is smaller than 68.9% so we need to increase the --target-coverage 
    # param.  It was 0.20 before... let's try 0.30.  
    # 11/30: Now get pairwise:
    ssh kolossus
    cd /cluster/data/dm1/bed/multiz7way/coverage
    foreach db (dp2 anoGam1 droAna1 droMoj1 droVir1 droYak1)
      cat /cluster/data/dm1/bed/blastz.$db/mafNet/*.maf \
      | mafRanges -notAllOGap stdin dm1 dm1.$db.mafRanges.bed
    end
    ssh hgwdev
    cd /cluster/data/dm1/bed/multiz7way/coverage
    foreach db (dp2 anoGam1 droAna1 droMoj1 droVir1 droYak1)
      echo $db
      nice featureBits dm1 -enrichment bdgpGene:cds dm1.$db.mafRanges.bed \
        -bed=dm1.$db.cds.bed
    end
#dp2
#bdgpGene:cds 17.224%, dm1.dp2.mafRanges.bed 58.121%, both 15.791%, cover 91.68%, enrich 1.58x
#anoGam1
#bdgpGene:cds 17.224%, dm1.anoGam1.mafRanges.bed 14.587%, both 11.003%, cover 63.88%, enrich 4.38x
#droAna1
#bdgpGene:cds 17.224%, dm1.droAna1.mafRanges.bed 65.118%, both 16.157%, cover 93.80%, enrich 1.44x
#droMoj1
#bdgpGene:cds 17.224%, dm1.droMoj1.mafRanges.bed 39.094%, both 15.033%, cover 87.28%, enrich 2.23x
#droVir1
#bdgpGene:cds 17.224%, dm1.droVir1.mafRanges.bed 42.200%, both 15.258%, cover 88.59%, enrich 2.10x
#droYak1
#bdgpGene:cds 17.224%, dm1.droYak1.mafRanges.bed 85.119%, both 16.935%, cover 98.32%, enrich 1.16x


# PHASTCONS 7WAY (DONE 11/19/04 angie)
    ssh kksilo
    # copy chrom fa to bluearc, break up the genome-wide MAFs into pieces
    mkdir -p /cluster/bluearc/dm1/chrom
    cp -p /cluster/data/dm1/?{,?}/chr*.fa /cluster/bluearc/dm1/chrom/
    ssh kki
    mkdir /cluster/data/dm1/bed/multiz7way/phastCons
    mkdir /cluster/data/dm1/bed/multiz7way/phastCons/run.split
    cd /cluster/data/dm1/bed/multiz7way/phastCons/run.split
    set WINDOWS = /cluster/bluearc/dm1/phastCons/WINDOWS
    rm -fr $WINDOWS
    mkdir -p $WINDOWS
    cat << 'EOF' > doSplit.sh
#!/bin/csh -ef

set PHAST=/cluster/bin/phast
set FA_SRC=/cluster/bluearc/dm1/chrom
set WINDOWS=/cluster/bluearc/dm1/phastCons/WINDOWS

set maf=$1
set c = $maf:t:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \
   -O dm1,droYak1,droAna1,dp2,droVir1,droMoj1,anoGam1 \
   -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
  gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.maf
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
    chmod a+x doSplit.sh
    rm -f jobList
    foreach file (/cluster/data/dm1/bed/multiz7way/maf/*.maf) 
      if (-s $file) then
        echo "doSplit.sh {check in line+ $file}" >> jobList
      endif
    end
    para create jobList
    para try,  check, push, check...
Completed: 11 of 11 jobs
Average job time:                  42s       0.69m     0.01h    0.00d
Longest job:                       96s       1.60m     0.03h    0.00d
Submission to last job:            96s       1.60m     0.03h    0.00d

    ssh kksilo
    cd /cluster/data/dm1/bed/multiz7way/phastCons/
    # Get genome-wide average GC content (for all species together,
    # not just the reference genome).  If you have a globally
    # estimated tree model, as above, you can get this from the
    # BACKGROUND line in the .mod file.  E.g.,
# ALPHABET: A C G T
# ...
# BACKGROUND: 0.276938 0.223190 0.223142 0.276730
    # add up the C and G:
    awk '$1 == "BACKGROUND:" {printf "%0.3f\n", $3 + $4;}' starting-tree.mod
#0.446
    # Great, use 0.446 as the --gc parameter in phastCons below:.

    ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY #############
    # use the model previously estimated as a starting model...
    cp ../../multiz8way/phastCons/run.estimate/ave.noncons.mod \
      starting-tree.mod 
    # Remove apiMel1 to get 7-way tree.
    $EDITOR starting-tree.mod
#TREE: (((((dm1:0.058,droYak1:0.074):0.1,droAna1:0.17):0.33,dp2:0.200):0,(droVir1:0.3,droMoj1:0.3):0):0.2,anoGam1:0.5);

    # Use consEntropy --NH to make it suggest a --expected-lengths param 
    # that we should try next.  Adam ran this on hg17 to find out the 
    # total entropy of the hg17 model:
    # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod
#Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045
#Relative entropy: H=0.608216 bits/site
#Required length: N=16.085437 sites
#Total entropy: NH=9.783421 bits
    # Our target is that NH result: 9.7834 bits.  Run that on the previously 
    # estimated models, with the params used to estimate them:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.2 25 \
      ../../multiz8way/phastCons/run.estimate/ave.{cons,noncons}.mod
#
#( Solving for new omega: 25.000000 14.959772 12.585290 12.339417 12.336344 )
#
#Transition parameters: gamma=0.200000, omega=25.000000, mu=0.040000, nu=0.010000
#Relative entropy: H=1.319603 bits/site
#Required length: N=8.794103 sites
#Total entropy: NH=11.604726 bits
#Recommended expected length: omega=12.336344 sites (for NH=9.783400)
    # OK, we will try --expected-lengths 12.3, although since we're also 
    # changing --target-coverage (see above) we probably won't get it 
    # right the first time.  
    mkdir run.estimate

    ############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ###########
    # We're here because the actual target coverage was not satisfactory,
    # so we're changing the --target-coverage param.  Given that we're 
    # changing that, take a guess at how we should change --expected-lengths
    # in order to also hit the total entropy target.
    cd /cluster/data/dm1/bed/multiz7way/phastCons/run.estimate
    # SECOND ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.4 18 ave.{cons,noncons}.mod
#Transition parameters: gamma=0.400000, omega=18.000000, mu=0.055556, nu=0.037037
#Relative entropy: H=1.355321 bits/site
#Required length: N=6.620854 sites
#Total entropy: NH=8.973382 bits
#Recommended expected length: omega=23.837201 sites (for NH=9.783400)
    # OK, bump up --expected-lengths to 24 for second iteration.
    # THIRD ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.5 24 ave.{cons,noncons}.mod
#Transition parameters: gamma=0.500000, omega=24.000000, mu=0.041667, nu=0.041667
#Relative entropy: H=1.235921 bits/site
#Required length: N=7.320146 sites
#Total entropy: NH=9.047124 bits
#Recommended expected length: omega=30.685782 sites (for NH=9.783400)
    # OK, bump up --expected-lengths to 31 for third iteration.
    # FOURTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.6 31 ave.{cons,noncons}.mod
#Transition parameters: gamma=0.600000, omega=31.000000, mu=0.032258, nu=0.048387
#Relative entropy: H=1.227887 bits/site
#Required length: N=7.351105 sites
#Total entropy: NH=9.026325 bits
#Recommended expected length: omega=39.549314 sites (for NH=9.783400)
    # OK, --expected-lengths 40.
    # FIFTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.57 40 ave.{cons,noncons}.mod
#Transition parameters: gamma=0.570000, omega=40.000000, mu=0.025000, nu=0.033140
#Relative entropy: H=1.223351 bits/site
#Required length: N=8.217344 sites
#Total entropy: NH=10.052700 bits
#Recommended expected length: omega=36.620015 sites (for NH=9.783400)
    # OK, --expected-lengths 36.6.
    # SIXTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.56 36.6 ave.{cons,noncons}.mod
#Transition parameters: gamma=0.560000, omega=36.600000, mu=0.027322, nu=0.034774
#Relative entropy: H=1.227349 bits/site
#Required length: N=8.033142 sites
#Total entropy: NH=9.859466 bits
#Recommended expected length: omega=35.696766 sites (for NH=9.783400)
    # OK, --expected-lengths 35.7.

    # Now set up cluster job to estimate model parameters given free params 
    # --target-coverage and --expected-lengths and the data.  
    cd /cluster/data/dm1/bed/multiz7way/phastCons/run.estimate
    # FIRST ITERATION: Use ../starting-tree.mod:
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.446 --nrates 1,1 \
    --no-post-probs --ignore-missing \
    --expected-lengths 12.3 --target-coverage 0.30 \
    --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model.
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.446 --nrates 1,1 \
    --no-post-probs --ignore-missing \
    --expected-lengths 35.7 --target-coverage 0.56 \
    --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    chmod a+x doEstimate.sh
    rm -fr LOG TREES
    mkdir -p LOG TREES
    rm -f jobList
    foreach f (/cluster/bluearc/dm1/phastCons/WINDOWS/*.ss.gz)
      set root = $f:t:r:r
      echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
    end
    # run cluster job
    ssh kk9
    cd /cluster/data/dm1/bed/multiz7way/phastCons/run.estimate
    para create jobList
    para try, check, push, check, ...
#Completed: 134 of 134 jobs
#Average job time:                 662s      11.03m     0.18h    0.01d
#Longest job:                     1477s      24.62m     0.41h    0.02d
#Submission to last job:          1574s      26.23m     0.44h    0.02d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh kksilo
    cd /cluster/data/dm1/bed/multiz7way/phastCons/run.estimate
    ls -1 TREES/*.cons.mod > cons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
      --output-average ave.cons.mod > cons_summary.txt
    ls -1 TREES/*.noncons.mod > noncons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
      --output-average ave.noncons.mod > noncons_summary.txt
    grep TREE ave*.mod
    # FIRST ITERATION:
#ave.cons.mod:TREE: (((((dm1:0.020863,droYak1:0.024051):0.033187,droAna1:0.063531):0.018324,dp2:0.066908):0.019698,(droVir1:0.039583,droMoj1:0.044682):0.053433):0.106049,anoGam1:0.106049);
#ave.noncons.mod:TREE: (((((dm1:0.090685,droYak1:0.105448):0.150077,droAna1:0.288625):0.084164,dp2:0.304556):0.092083,(droVir1:0.176988,droMoj1:0.200279):0.253431):0.490731,anoGam1:0.490731);
    # SECOND ITERATION:
#ave.cons.mod:TREE: (((((dm1:0.024527,droYak1:0.028187):0.039154,droAna1:0.074652):0.021897,dp2:0.078698):0.023959,(droVir1:0.047008,droMoj1:0.053078):0.062285):0.126590,anoGam1:0.126590);
#ave.noncons.mod:TREE: (((((dm1:0.093917,droYak1:0.108899):0.155960,droAna1:0.298758):0.088657,dp2:0.315328):0.098722,(droVir1:0.185255,droMoj1:0.209600):0.259606):0.516221,anoGam1:0.516221);
    # THIRD ITERATION:
#ave.cons.mod:TREE: (((((dm1:0.026164,droYak1:0.030017):0.041812,droAna1:0.079557):0.023492,dp2:0.083908):0.025849,(droVir1:0.050196,droMoj1:0.056691):0.066111):0.135440,anoGam1:0.135440);
#ave.noncons.mod:TREE: (((((dm1:0.097274,droYak1:0.112628):0.161761,droAna1:0.309265):0.092448,dp2:0.326497):0.103541,(droVir1:0.192162,droMoj1:0.217414):0.267607):0.536500,anoGam1:0.536500);
    # FOURTH ITERATION:
#ave.cons.mod:TREE: (((((dm1:0.027759,droYak1:0.031804):0.044422,droAna1:0.084321):0.025065,dp2:0.088941):0.027674,(droVir1:0.053244,droMoj1:0.060132):0.069757):0.143307,anoGam1:0.143307);
#ave.noncons.mod:TREE: (((((dm1:0.100831,droYak1:0.116583):0.167801,droAna1:0.320024):0.096332,dp2:0.337862):0.108245,(droVir1:0.199102,droMoj1:0.225211):0.275450):0.553926,anoGam1:0.553926);
    # FIFTH ITERATION:
#ave.cons.mod:TREE: (((((dm1:0.027267,droYak1:0.031253):0.043621,droAna1:0.082864):0.024584,dp2:0.087408):0.027120,(droVir1:0.052313,droMoj1:0.059084):0.068661):0.141041,anoGam1:0.141041);
#ave.noncons.mod:TREE: (((((dm1:0.099838,droYak1:0.115485):0.166143,droAna1:0.317118):0.095263,dp2:0.334808):0.106969,(droVir1:0.197211,droMoj1:0.223103):0.273475):0.549842,anoGam1:0.549842);
    # SIXTH ITERATION:
#ave.cons.mod:TREE: (((((dm1:0.027105,droYak1:0.031070):0.043350,droAna1:0.082377):0.024423,dp2:0.086892):0.026930,(droVir1:0.052002,droMoj1:0.058734):0.068290):0.140237,anoGam1:0.140237);
#ave.noncons.mod:TREE: (((((dm1:0.099464,droYak1:0.115066):0.165490,droAna1:0.315987):0.094859,dp2:0.333609):0.106465,(droVir1:0.196484,droMoj1:0.222286):0.272655):0.548007,anoGam1:0.548007);

    cat cons_summary.txt 
    # look over the files cons_summary.txt and noncons_summary.txt.
    # The means and medians should be roughly equal and the stdevs
    # should be reasonably small compared to the means, particularly
    # for rate matrix parameters (at bottom) and for branches to the
    # leaves of the tree.  The stdevs may be fairly high for branches
    # near the root of the tree; that's okay.  Some min values may be
    # 0 for some parameters.  That's okay, but watch out for very large
    # values in the max column, which might skew the mean.  If you see
    # any signs of bad outliers, you may have to track down the
    # responsible .mod files and throw them out.  I've never had to do
    # this; the estimates generally seem pretty well behaved.

    # NOTE: Actually, a random sample of several hundred to a thousand
    # alignment fragments (say, a number equal to the number of
    # available cluster nodes) should be more than adequate for
    # parameter estimation.  If pressed for time, use this strategy.

    # Check the total entropy figure to see if we're way off:
    # FIRST ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.30 12.3 ave.{cons,noncons}.mod
#Transition parameters: gamma=0.300000, omega=12.300000, mu=0.081301, nu=0.034843
#Relative entropy: H=1.355321 bits/site
#Required length: N=6.455682 sites
#Total entropy: NH=8.749521 bits
#Recommended expected length: omega=17.959820 sites (for NH=9.783400)
    # OK, bump up --expected-lengths to 18 below (and for next iteration).
    # SECOND ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.40 24 ave.{cons,noncons}.mod
#Transition parameters: gamma=0.400000, omega=24.000000, mu=0.041667, nu=0.027778
#Relative entropy: H=1.235921 bits/site
#Required length: N=7.943664 sites
#Total entropy: NH=9.817744 bits
#Recommended expected length: omega=23.714197 sites (for NH=9.783400)
    # OK, looks like 24 is still about right for --expected-lengths having 
    # bumped up --target-coverage.  
    # THIRD ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.50 31 ave.{cons,noncons}.mod
#Transition parameters: gamma=0.500000, omega=31.000000, mu=0.032258, nu=0.032258
#Relative entropy: H=1.227887 bits/site
#Required length: N=7.992415 sites
#Total entropy: NH=9.813781 bits
#Recommended expected length: omega=30.685778 sites (for NH=9.783400)
    # Cool, 31 is still about right.
    # FOURTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.60 40 ave.{cons,noncons}.mod
#Transition parameters: gamma=0.600000, omega=40.000000, mu=0.025000, nu=0.037500
#3Relative entropy: H=1.223351 bits/site
#Required length: N=8.025354 sites
#Total entropy: NH=9.817829 bits
#Recommended expected length: omega=39.556330 sites (for NH=9.783400)
    # 40 still OK, good.
    # FIFTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.57 36.6 ave.{cons,noncons}.mod
#Transition parameters: gamma=0.570000, omega=36.600000, mu=0.027322, nu=0.036218
#Relative entropy: H=1.227349 bits/site
#Required length: N=7.970096 sites
#Total entropy: NH=9.782087 bits
#Recommended expected length: omega=36.615726 sites (for NH=9.783400)
    # Yup, --expected-lengths 36.6.
    # SIXTH ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.56 35.7 ave.{cons,noncons}.mod
#Transition parameters: gamma=0.560000, omega=35.700000, mu=0.028011, nu=0.035651
#Relative entropy: H=1.227599 bits/site
#Required length: N=7.969781 sites
#Total entropy: NH=9.783694 bits
#Recommended expected length: omega=35.696550 sites (for NH=9.783400)
    # We probably don't need to do this, since it always has agreed, but it 
    # only takes a second.

    # Now we are ready to set up the cluster job for computing the
    # conservation scores and predicted elements.  The we measure the 
    # conserved elements coverage, and if that's not satisfactory then we 
    # adjust parameters and repeat.  
    ssh kk9
    mkdir /cluster/data/dm1/bed/multiz7way/phastCons/run.phast
    cd /cluster/data/dm1/bed/multiz7way/phastCons/run.phast
    cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
    ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
    --expected-lengths 35.7 --target-coverage 0.56 \
    --quiet --seqname $chr --idpref $pref \
    --viterbi /cluster/bluearc/dm1/phastCons/ELEMENTS/$pref.bed --score \
    --require-informative 0 \
  > $tmpfile
gzip -c $tmpfile > /cluster/bluearc/dm1/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
    chmod a+x doPhastCons.sh
    rm -fr /cluster/bluearc/dm1/phastCons/{POSTPROBS,ELEMENTS}
    mkdir -p /cluster/bluearc/dm1/phastCons/{POSTPROBS,ELEMENTS}
    rm -f jobList
    foreach f (/cluster/bluearc/dm1/phastCons/WINDOWS/*.ss.gz)
      echo doPhastCons.sh $f >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 134 of 134 jobs
#Average job time:                  17s       0.29m     0.00h    0.00d
#Longest job:                       28s       0.47m     0.01h    0.00d
#Submission to last job:            34s       0.57m     0.01h    0.00d

    # back on kksilo:
    # combine predictions and transform scores to be in 0-1000 interval
    cd /cluster/data/dm1/bed/multiz7way/phastCons
    awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
      /cluster/bluearc/dm1/phastCons/ELEMENTS/*.bed \
    | /cluster/bin/scripts/lodToBedScore > all.bed

    ssh hgwdev
    # Now measure coverage of CDS by conserved elements. 
    # We want the "cover" figure to be close to 68.9%.
    cd /cluster/data/dm1/bed/multiz7way/phastCons
    featureBits -enrichment dm1 bdgpGene:cds all.bed
    # FIRST ITERATION: too low; increase --target-coverage and re-estimate.
#bdgpGene:cds 17.224%, all.bed 31.047%, both 9.866%, cover 57.28%, enrich 1.85x
    # SECOND ITERATION: still too low; increase --target-coverage, re-estimate.
#bdgpGene:cds 17.224%, all.bed 35.450%, both 10.948%, cover 63.56%, enrich 1.79x
    # THIRD ITERATION: close!  Still a little low, repeat.
#bdgpGene:cds 17.224%, all.bed 38.729%, both 11.542%, cover 67.01%, enrich 1.73x
    # FOURTH ITERATION: OK, a bit high now; back off and repeat.
#bdgpGene:cds 17.224%, all.bed 42.087%, both 12.076%, cover 70.11%, enrich 1.67x
    # FIFTH ITERATION: back off just a smidge more.
#bdgpGene:cds 17.224%, all.bed 41.098%, both 11.927%, cover 69.25%, enrich 1.68x
    # SIXTH ITERATION: woohoo!
#bdgpGene:cds 17.224%, all.bed 40.753%, both 11.871%, cover 68.92%, enrich 1.69x

    # Having met the CDS coverage target, load up the results.
    hgLoadBed dm1 phastConsElements7way all.bed

    # Create wiggle on the small cluster
    ssh kki
    mkdir /cluster/data/dm1/bed/multiz7way/phastCons/run.wib
    cd /cluster/data/dm1/bed/multiz7way/phastCons/run.wib
    rm -rf /cluster/bluearc/dm1/phastCons/wib
    mkdir -p /cluster/bluearc/dm1/phastCons/wib
    cat << 'EOF' > doWigEncode
#!/bin/csh -ef
set chr = $1
cd /cluster/bluearc/dm1/phastCons/wib
zcat `ls -1 /cluster/bluearc/dm1/phastCons/POSTPROBS/$chr.*.pp.gz \
      | sort -t\. -k2,2n` \
| wigEncode stdin ${chr}_phastCons.wi{g,b}
'EOF'
# << for emacs
    chmod a+x doWigEncode
    rm -f jobList
    foreach chr (`ls -1 /cluster/bluearc/dm1/phastCons/POSTPROBS \
                  | awk -F\. '{print $1}' | sort -u`)
      echo doWigEncode $chr >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 11 of 11 jobs
#Average job time:                  10s       0.17m     0.00h    0.00d
#Longest job:                       22s       0.37m     0.01h    0.00d
#Submission to last job:            22s       0.37m     0.01h    0.00d

    # back on kksilo, copy wibs, wigs and POSTPROBS (people sometimes want 
    # the raw scores) from bluearc
    cd /cluster/data/dm1/bed/multiz7way/phastCons
    rm -rf wib POSTPROBS
    rsync -av /cluster/bluearc/dm1/phastCons/wib .
    rsync -av /cluster/bluearc/dm1/phastCons/POSTPROBS .

    # load wiggle component of Conservation track
    ssh hgwdev
    mkdir /gbdb/dm1/multiz7way/wib
    cd /cluster/data/dm1/bed/multiz7way/phastCons
    chmod 775 . wib
    chmod 664 wib/*.wib
    ln -s `pwd`/wib/*.wib /gbdb/dm1/multiz7way/wib/
    hgLoadWiggle dm1 phastCons7way \
      -pathPrefix=/gbdb/dm1/multiz7way/wib wib/*.wig

    # make top-5000 list and launcher on Adam's home page:
    sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \
    > top5000.bed
    /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
      /cse/grads/acs/public_html/dm-top5000-7way \
      "top 5000 conserved elements (7way)" dm1

    # and clean up bluearc.
    rm -r /cluster/bluearc/dm1/phastCons
# NOT DONE (I see more phastCons in the future :)
    rm -r /cluster/bluearc/dm1/chrom


# FLYREG (DONE 12/13/04 angie)
    # Flyreg loaded 11/22/04; MEME motifs added 12/13/04
    ssh kksilo
    mkdir /cluster/data/dm1/bed/flyreg
    cd /cluster/data/dm1/bed/flyreg
    wget http://www.gen.cam.ac.uk/casey/Data/Footprint.GFF
    # This is not GTF; it should really be bed +.  The contributor, 
    # Casey Bergman, says that coords are 0-based half-open, so 
    # translation will be even easier.
    grep -v '^#' Footprint.GFF \
    | perl -wpe 'if (! s/^(\S+)\tBergman_data\tbinding_site\t(\d+)\t(\d+)\t.\t.\t.\tFactor \"([^\"]+)\"; Target \"([^\"]+)\"; PMID \"(\d+)\"$/chr$1\t$2\t$3\t$4\t$5\t$6/) { die "Cant parse line $.:\n$_\n"; }' \
      > flyreg.bed
    ssh hgwdev
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/flyreg.sql \
      dm1 flyreg /cluster/data/dm1/bed/flyreg/flyreg.bed
    # 12/13/04: Add Dan Pollard's MEME motif data for the footprints:
    ssh kksilo
    cd /cluster/data/dm1/bed/flyreg
    wget http://rana.lbl.gov/~dan/matrices/bergman2004/footprint_matrices.txt
    ./extractMatrices.pl footprint_matrices.txt > flyregMotif.tab
    ssh hgwdev
    cd /cluster/data/dm1/bed/flyreg
    sed -e 's/dnaMotif/flyregMotif/' $HOME/kent/src/hg/lib/dnaMotif.sql \
    > flyregMotif.sql
    hgsql dm1 < flyregMotif.sql
    hgsql dm1 -e 'load data local infile "flyregMotif.tab" into table flyregMotif'


# BLASTZ D.ERECTA (DONE 11/24/04 angie)
    ssh kksilo
    mkdir /cluster/data/dm1/bed/blastz.droEre1.2004-11-24
    cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24
    ln -s blastz.droEre1.2004-11-24 /cluster/data/dm1/bed/blastz.droEre1
    cat << '_EOF_' > DEF
# D.melanogaster vs. D.erecta
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm1/nib
SEQ1_SMSK=
SEQ1_FLAG=-drosophila
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - D. erecta
SEQ2_DIR=/iscratch/i/droEre1/chunks
SEQ2_SMSK=
SEQ2_FLAG=-drosophila
SEQ2_IN_CONTIGS=1
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/dm1/bed/blastz.droEre1.2004-11-24

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

'_EOF_'
    # << this line keeps emacs coloring happy
    # run bash shell if you don't already:
    bash
    source DEF
    mkdir run
    /cluster/bin/scripts/blastz-make-joblist $DEF > $BASE/run/j
    sh ./xdir.sh
    cd run
    sed -e 's#^#/cluster/bin/penn/#' j > j2
    wc -l j*
    head j2
    mv j2 j
    # cluster run
    ssh kk
    cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/run
    para create j
    para try, check, push, check, ....
#Completed: 5460 of 5460 jobs
#Average job time:                  46s       0.76m     0.01h    0.00d
#Longest job:                     1230s      20.50m     0.34h    0.01d
#Submission to last job:          1401s      23.35m     0.39h    0.02d

    # back in the bash shell on kksilo...
    mkdir /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/run.1
    cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/run.1
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE > j
    # small cluster run
    ssh kki
    cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/run.1
    para create j
    para try, check, push, check, ....
#Completed: 21 of 21 jobs
#Average job time:                  15s       0.26m     0.00h    0.00d
#Longest job:                       33s       0.55m     0.01h    0.00d
#Submission to last job:            49s       0.82m     0.01h    0.00d

    # Translate .lav to axt:
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24
    rm -r raw
    mkdir axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin /cluster/data/dm1/nib \
            /cluster/data/droEre1/droEre1.2bit stdout \
        | axtSort stdin ../../$out
      popd
    end


# CHAIN ERECTA BLASTZ (DONE 11/24/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir chain
    ls -1S /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh -ef
axtChain -verbose=0 $1 \
  /iscratch/i/dm1/nib \
  /iscratch/i/droEre1/droEre1.2bit stdout \
| chainAntiRepeat /iscratch/i/dm1/nib /iscratch/i/droEre1/droEre1.2bit \
    stdin $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 11 of 11 jobs
#Average job time:                   8s       0.14m     0.00h    0.00d
#Longest job:                       23s       0.38m     0.01h    0.00d
#Submission to last job:            23s       0.38m     0.01h    0.00d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain dm1 ${c}_chainDroEre1 $i
    end


# NET ERECTA BLASTZ (DONE 11/24/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/axtChain
    netClass -noAr noClass.net dm1 droEre1 erecta.net 

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/axtChain
    rm noClass.net
    netFilter -syn erecta.net > erectaSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/dm1/bed/blastz.droEre1.2004-11-24/axtChain
    netFilter -minGap=10 erecta.net |  hgLoadNet dm1 netDroEre1 stdin
    netFilter -minGap=10 erectaSyn.net \
    | hgLoadNet dm1 netSyntenyDroEre1 stdin


# GENERATE DROERE1 AXTNET AND MAF FOR MULTIZ (DONE 11/24/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droEre1/axtChain
    netSplit erecta.net net
    cd ..
    mkdir axtNet
    foreach f (axtChain/net/chr*.net)
      netToAxt $f axtChain/chain/$f:t:r.chain \
        /cluster/data/dm1/nib /cluster/data/droEre1/droEre1.2bit stdout \
      | axtSort stdin axtNet/$f:t:r.axt
    end
    mkdir mafNet
    foreach f (axtNet/chr*.axt)
      set maf = mafNet/$f:t:r.maf
      axtToMaf $f \
            /cluster/data/dm1/chrom.sizes /cluster/data/droEre1/chrom.sizes \
            $maf -tPrefix=dm1. -qPrefix=droEre1.
    end


# MAKE VSDROERE1 DOWNLOADABLES (DONE 11/24/04 angie)
    ssh kksilo
    cd /cluster/data/dm1/bed/blastz.droEre1
    nice gzip axtChain/{all.chain,*.net} axtNet/*.axt axtChrom/*.axt
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/dm1/vsDroEre1
    cd /usr/local/apache/htdocs/goldenPath/dm1/vsDroEre1
    cp -p /cluster/data/dm1/bed/blastz.droEre1/axtChain/all.chain.gz \
      erecta.chain.gz
    cp -p /cluster/data/dm1/bed/blastz.droEre1/axtChain/erecta.net.gz .
    cp -pR /cluster/data/dm1/bed/blastz.droEre1/axtNet .
    md5sum *.gz */*.gz > md5sum.txt
    # Make a README.txt which explains the files & formats.


# TWINSCAN (DONE 11/29/04 angie)
    ssh kksilo
    mkdir /cluster/data/dm1/bed/twinscan
    cd /cluster/data/dm1/bed/twinscan
    foreach c (2L 2R 3L 3R 4 X)
      wget http://genes.cs.wustl.edu/predictions/Drosophila/dm1_11-24-04/chr_gtf/chr$c.gtf
      wget http://genes.cs.wustl.edu/predictions/Drosophila/dm1_11-24-04/chr_ptx/chr$c.ptx
    end
    # Add '.a' to end of protein fasta id's, to match gtf transcript_id's:
    perl -wpe 's/^(>\S+).*/$1.a/' *.ptx > twinscanPep.fa
    # load.
    ssh hgwdev
    cd /cluster/data/dm1/bed/twinscan
    ldHgGene -gtf -genePredExt dm1 twinscan chr*.gtf
    hgPepPred dm1 generic twinscanPep twinscanPep.fa
    featureBits -enrichment dm1 refGene twinscan
#refGene 22.303%, twinscan 16.410%, both 15.755%, cover 70.64%, enrich 4.30x

# MAF COVERAGE, MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES, FOR
# REVISED PHASTCONS PAPER (DONE 1/5/05 acs)

    mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/coverage
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/coverage
    cat ../mypa/chr*.maf \
    | nice mafRanges -notAllOGap stdin dm1 dm1.4way.mafRanges.bed
    nice featureBits -enrichment dm1 bdgpGene:cds dm1.4way.mafRanges.bed
# bdgpGene:cds 17.224%, dm1.4way.mafRanges.bed 86.879%, both 17.142%, cover 99.53%, enrich 1.15x

# FEEP EXPRESSION DATA (DONE 1/28/05 angie)
    # Full Euchromatic Expression Profile data (Stolc et al. 2004, White lab)
    # Raw data files from http://genome.med.yale.edu/FEEP/ downloaded to 
    # /projects/compbio/data/microarray/flyFEEP/ .
    # Used ad hoc perl scripts to massage the data files into hgFixed 
    # expression tables -- see 
    # /projects/compbio/data/microarray/flyFEEP/README .
    # See makeHgFixed.doc for how *MedianRatio table was extracted.  
    ssh hgwdev
    mkdir /cluster/data/dm1/bed/flyFeep
    cd /cluster/data/dm1/bed/flyFeep
    # More perling to get bed coords; then hgMapMicroarray to make bed 15:
    ./probes2bed.pl /projects/compbio/data/microarray/flyFEEP/probes3.1-* \
      > flyFeepProbesBed12.bed
    hgMapMicroarray -bedIn flyFeep.bed hgFixed.flyFeepMedianRatio \
      flyFeepProbesBed12.bed
    hgLoadBed dm1 flyFeep flyFeep.bed
    # "Significant" set of probes:
    # (see http://bussemaker.bio.columbia.edu/papers/Science2004/peab_discussion.html)
    wget http://bussemaker.bio.columbia.edu/papers/Science2004/significant_EPs_NEPs_GC10-11_usedinpaper/EPNEP_GC10-11_GC3-8crxn_FDR05_none_pool24_all_EP
    wget http://bussemaker.bio.columbia.edu/papers/Science2004/significant_EPs_NEPs_GC10-11_usedinpaper/EPNEP_GC10-11_GC3-8crxn_FDR05_none_pool24_all_NEP
    wget http://bussemaker.bio.columbia.edu/papers/Science2004/significant_SJPs/SJP_WSJP_GC3-8crxn_FDR05_none_pool24_all
    # ANOVA set (no splice junction probes):
    wget http://bussemaker.bio.columbia.edu/papers/Science2004/anova/EP_diff_expressed
    wget http://bussemaker.bio.columbia.edu/papers/Science2004/anova/NEP_diff_expressed
    # Distill those files to lists of IDs; then use the IDs to winnow 
    # flyFeep.bed.
    awk '{print $1;}' *FDR05* > probesExpressedAboveBackground.txt
    awk '{print $1;}' *_diff_expressed > probesAnovaDiffExpressed.txt
    ./winnowBed.pl probesExpressedAboveBackground.txt flyFeep.bed \
      > flyFeepPEAB.bed
    ./winnowBed.pl probesAnovaDiffExpressed.txt flyFeep.bed \
      > flyFeepAnova.bed
    hgLoadBed dm1 flyFeepPEAB flyFeepPEAB.bed
    hgLoadBed dm1 flyFeepAnova flyFeepAnova.bed


# PHASTCONS 8WAY WITH METHODS FROM PAPER (DONE 1/26/05 angie)
    # Same procedure as for 7way above -- using the param estimation methods 
    # described in Adam's Genome Research paper.
    # Move aside the previous phastCons run dir:
    ssh hgwdev
    mv /cluster/data/dm1/bed/multiz8way/phastCons \
      /cluster/data/dm1/bed/multiz8way/phastCons.2004-11-29
    # That breaks /gbdb/links -- fix temporarily, we'll overwrite when done:
    rm /gbdb/dm1/multiz8way/wib/*
    ln -s /cluster/store6/dm1/bed/multiz8way.2004-11-09/phastCons.2004-11-29/wib/chr*.wib /gbdb/dm1/multiz8way/wib
    # chrom fa is already in /cluster/bluearc/dm1/chrom/, no need to copy.
    # Split chrom fa into smaller windows for phastCons:
    ssh kki
    mkdir /cluster/data/dm1/bed/multiz8way/phastCons
    mkdir /cluster/data/dm1/bed/multiz8way/phastCons/run.split
    cd /cluster/data/dm1/bed/multiz8way/phastCons/run.split
    set WINDOWS = /cluster/bluearc/dm1/phastCons/WINDOWS
    rm -fr $WINDOWS
    mkdir -p $WINDOWS
    cat << 'EOF' > doSplit.sh
#!/bin/csh -ef

set PHAST=/cluster/bin/phast
set FA_SRC=/cluster/bluearc/dm1/chrom
set WINDOWS=/cluster/bluearc/dm1/phastCons/WINDOWS

set maf=$1
set c = $maf:t:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
# apiMel1-specific tweak for msa_split:
perl -wpe 's/(apiMel1\.Group\w+)\.(\d+)/$1_$2/' $1 > $tmpDir/$c.maf
${PHAST}/msa_split $tmpDir/$c.maf -i MAF -M ${FA_SRC}/$c.fa \
   -O dm1,droYak1,droAna1,dp2,droVir1,droMoj1,anoGam1,apiMel1 \
   -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
  gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.maf
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
    chmod a+x doSplit.sh
    rm -f jobList
    foreach file (/cluster/data/dm1/bed/multiz8way/maf/*.maf) 
      if (-s $file) then
        echo "doSplit.sh {check in line+ $file}" >> jobList
      endif
    end
    para create jobList
    para try,  check, push, check...
#Completed: 11 of 11 jobs
#Average job time:                  49s       0.81m     0.01h    0.00d
#Longest job:                      147s       2.45m     0.04h    0.00d
#Submission to last job:           158s       2.63m     0.04h    0.00d

    ssh kksilo
    cd /cluster/data/dm1/bed/multiz8way/phastCons/
    # Use the previous 8way run's ave.noncons.mod as starting model:
    cp ../phastCons.2004-11-29/run.estimate/ave.noncons.mod starting-tree.mod
    # Get genome-wide all-species average GC content from starting-tree.mod:
    # [when starting fresh, can use msa_view --aggregate --summary-only 
    #  as in makeHg17.doc but that's slower]
    awk '$1 == "BACKGROUND:" {printf "%0.3f\n", $3 + $4;}' starting-tree.mod
#0.446
    # Use 0.446 as the --gc parameter in phastCons below:.

    ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY #############
    # Use consEntropy --NH to make it suggest a --expected-lengths param 
    # that we should try next.  Adam ran this on hg17 to find out the 
    # total entropy of the hg17 model:
    # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod
#Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045
#Relative entropy: H=0.608216 bits/site
#Required length: N=16.085437 sites
#Total entropy: NH=9.783421 bits
    # Our target is that NH result: 9.7834 bits.  
    # Use the values of --target-coverage and --expected-lengths from the 
    # last iteration of the 7way run, 0.56 and 35.7.  
    # Use the 7-way estimated models.  Should be a real headstart relative 
    # to starting from scratch (in which case we would use phyloFit).
    /cluster/bin/phast/consEntropy --NH 9.7834 0.56 35.7 \
      ../../multiz8way/phastCons.2004-11-29/run.estimate/ave.{cons,noncons}.mod
#( Solving for new omega: 35.700000 35.623093 )
#Transition parameters: gamma=0.560000, omega=35.700000, mu=0.028011, nu=0.035651
#Relative entropy: H=1.319603 bits/site
#Required length: N=7.418872 sites
#Total entropy: NH=9.789966 bits
#Recommended expected length: omega=35.623093 sites (for NH=9.783400)
    # OK, we will try --expected-lengths 35.6.

    ############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ###########
    # We're here because the actual target coverage was not satisfactory,
    # so we're changing the --target-coverage param.  Given that we're 
    # changing that, take a guess at how we should change --expected-lengths
    # in order to also hit the total entropy target.
    cd /cluster/data/dm1/bed/multiz8way/phastCons/run.estimate
    # SECOND ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.565 35.5 ave.{cons,noncons}.mod
#Recommended expected length: omega=35.982083 sites (for NH=9.783400)
    # OK, --expected-lengths 36.

    # Now set up cluster job to estimate model parameters given free params 
    # --target-coverage and --expected-lengths and the data.  
    ssh kk9
    mkdir /cluster/data/dm1/bed/multiz8way/phastCons/run.estimate
    cd /cluster/data/dm1/bed/multiz8way/phastCons/run.estimate
    # FIRST ITERATION: Use ../starting-tree.mod:
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.mod --gc 0.446 --nrates 1,1 \
    --no-post-probs --ignore-missing \
    --expected-lengths 35.6 --target-coverage 0.56 \
    --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model.
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.446 --nrates 1,1 \
    --no-post-probs --ignore-missing \
    --expected-lengths 36 --target-coverage 0.565 \
    --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    chmod a+x doEstimate.sh
    rm -fr LOG TREES
    mkdir -p LOG TREES
    rm -f jobList
    foreach f (/cluster/bluearc/dm1/phastCons/WINDOWS/*.ss.gz)
      set root = $f:t:r:r
      echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
    end
    # run cluster job
    para create jobList
    para try, check, push, check, ...
#Completed: 134 of 134 jobs
#Average job time:                1076s      17.93m     0.30h    0.01d
#Longest job:                     2118s      35.30m     0.59h    0.02d
#Submission to last job:          2791s      46.52m     0.78h    0.03d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh kksilo
    cd /cluster/data/dm1/bed/multiz8way/phastCons/run.estimate
    ls -1 TREES/*.cons.mod > cons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
      --output-average ave.cons.mod > cons_summary.txt
    ls -1 TREES/*.noncons.mod > noncons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
      --output-average ave.noncons.mod > noncons_summary.txt
    grep TREE ave*.mod
    # FIRST ITERATION:
#ave.cons.mod:TREE: ((((((dm1:0.028206,droYak1:0.033232):0.044076,droAna1:0.086199):0.023901,dp2:0.091206):0.023263,(droVir1:0.053709,droMoj1:0.060420):0.074761):0.080532,anoGam1:0.202889):0.143734,apiMel1:0.143734);
#ave.noncons.mod:TREE: ((((((dm1:0.096925,droYak1:0.114585):0.159114,droAna1:0.310607):0.088457,dp2:0.329123):0.088473,(droVir1:0.190784,droMoj1:0.215082):0.280416):0.306475,anoGam1:0.738306):0.535906,apiMel1:0.535906);
    # SECOND ITERATION:
#ave.cons.mod:TREE: ((((((dm1:0.028359,droYak1:0.033409):0.044355,droAna1:0.086718):0.024081,dp2:0.091763):0.023499,(droVir1:0.054045,droMoj1:0.060801):0.075211):0.081326,anoGam1:0.204280):0.144740,apiMel1:0.144740);
#ave.noncons.mod:TREE: ((((((dm1:0.097342,droYak1:0.115064):0.159904,droAna1:0.312067):0.088999,dp2:0.330694):0.089234,(droVir1:0.191751,droMoj1:0.216177):0.281661):0.309045,anoGam1:0.742389):0.538846,apiMel1:0.538846);

    cat cons_summary.txt 
    # look over the files cons_summary.txt and noncons_summary.txt.
    # The means and medians should be roughly equal and the stdevs
    # should be reasonably small compared to the means, particularly
    # for rate matrix parameters (at bottom) and for branches to the
    # leaves of the tree.  The stdevs may be fairly high for branches
    # near the root of the tree; that's okay.  Some min values may be
    # 0 for some parameters.  That's okay, but watch out for very large
    # values in the max column, which might skew the mean.  If you see
    # any signs of bad outliers, you may have to track down the
    # responsible .mod files and throw them out.  I've never had to do
    # this; the estimates generally seem pretty well behaved.

    # NOTE: Actually, a random sample of several hundred to a thousand
    # alignment fragments (say, a number equal to the number of
    # available cluster nodes) should be more than adequate for
    # parameter estimation.  If pressed for time, use this strategy.

    # Check the total entropy figure to see if we're way off.
    # We probably don't need to do this, since it has always been very close 
    # if not the same as what we used above, but it only takes a second.
    # FIRST ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.56 35.6 ave.{cons,noncons}.mod
#Recommended expected length: omega=35.540632 sites (for NH=9.783400)
    # OK, tweak --expected-lengths to 35.5 below (and for next iteration).
    # SECOND ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.565 36 ave.{cons,noncons}.mod
#Recommended expected length: omega=35.981000 sites (for NH=9.783400)
    # ==> keep at 36.

    # Now we are ready to set up the cluster job for computing the
    # conservation scores and predicted elements.  The we measure the 
    # conserved elements coverage, and if that's not satisfactory then we 
    # adjust parameters and repeat.  
    ssh kk9
    mkdir /cluster/data/dm1/bed/multiz8way/phastCons/run.phast
    cd /cluster/data/dm1/bed/multiz8way/phastCons/run.phast
    cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
    ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
    --expected-lengths 36 --target-coverage 0.565 \
    --quiet --seqname $chr --idpref $pref \
    --viterbi /cluster/bluearc/dm1/phastCons/ELEMENTS/$pref.bed --score \
    --require-informative 0 \
  > $tmpfile
gzip -c $tmpfile > /cluster/bluearc/dm1/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
    chmod a+x doPhastCons.sh
    rm -fr /cluster/bluearc/dm1/phastCons/{POSTPROBS,ELEMENTS}
    mkdir -p /cluster/bluearc/dm1/phastCons/{POSTPROBS,ELEMENTS}
    rm -f jobList
    foreach f (/cluster/bluearc/dm1/phastCons/WINDOWS/*.ss.gz)
      echo doPhastCons.sh $f >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 134 of 134 jobs
#Average job time:                  16s       0.27m     0.00h    0.00d
#Longest job:                       23s       0.38m     0.01h    0.00d
#Submission to last job:            47s       0.78m     0.01h    0.00d

    # back on kksilo:
    # combine predictions and transform scores to be in 0-1000 interval
    cd /cluster/data/dm1/bed/multiz8way/phastCons
    awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
      /cluster/bluearc/dm1/phastCons/ELEMENTS/*.bed \
    | /cluster/bin/scripts/lodToBedScore > all.bed

    ssh hgwdev
    # Now measure coverage of CDS by conserved elements. 
    # We want the "cover" figure to be close to 68.9%.
    cd /cluster/data/dm1/bed/multiz8way/phastCons
    featureBits -enrichment dm1 bdgpGene:cds all.bed
    # FIRST ITERATION: a bit too low; increase --target-coverage, re-estimate.
#bdgpGene:cds 17.224%, all.bed 42.260%, both 11.796%, cover 68.49%, enrich 1.62x
    # SECOND ITERATION: 68.85%'ll do!
#bdgpGene:cds 17.224%, all.bed 42.608%, both 11.859%, cover 68.85%, enrich 1.62x

    # Having met the CDS coverage target, load up the results.
    hgLoadBed dm1 phastConsElements8way all.bed

    # Create wiggle on the small cluster
    ssh kki
    mkdir /cluster/data/dm1/bed/multiz8way/phastCons/run.wib
    cd /cluster/data/dm1/bed/multiz8way/phastCons/run.wib
    rm -rf /cluster/bluearc/dm1/phastCons/wib
    mkdir -p /cluster/bluearc/dm1/phastCons/wib
    cat << 'EOF' > doWigEncode
#!/bin/csh -ef
set chr = $1
cd /cluster/bluearc/dm1/phastCons/wib
zcat `ls -1 /cluster/bluearc/dm1/phastCons/POSTPROBS/$chr.*.pp.gz \
      | sort -t\. -k2,2n` \
| wigEncode stdin ${chr}_phastCons.wi{g,b}
'EOF'
# << for emacs
    chmod a+x doWigEncode
    rm -f jobList
    foreach chr (`ls -1 /cluster/bluearc/dm1/phastCons/POSTPROBS \
                  | awk -F\. '{print $1}' | sort -u`)
      echo doWigEncode $chr >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 11 of 11 jobs
#Average job time:                  20s       0.34m     0.01h    0.00d
#Longest job:                      114s       1.90m     0.03h    0.00d
#Submission to last job:           114s       1.90m     0.03h    0.00d

    # back on kksilo, copy wibs, wigs and POSTPROBS (people sometimes want 
    # the raw scores) from bluearc
    cd /cluster/data/dm1/bed/multiz8way/phastCons
    rm -rf wib POSTPROBS
    rsync -av /cluster/bluearc/dm1/phastCons/wib .
    rsync -av /cluster/bluearc/dm1/phastCons/POSTPROBS .

    # load wiggle component of Conservation track
    ssh hgwdev
    mkdir /gbdb/dm1/multiz8way/wib
    cd /cluster/data/dm1/bed/multiz8way/phastCons
    chmod 775 . wib
    chmod 664 wib/*.wib
    ln -s `pwd`/wib/*.wib /gbdb/dm1/multiz8way/wib/
    hgLoadWiggle dm1 phastCons8way \
      -pathPrefix=/gbdb/dm1/multiz8way/wib wib/*.wig

    # make top-5000 list and launcher on Adam's home page:
    sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \
    > top5000.bed
    /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
      /cse/grads/acs/public_html/dm-top5000-8way \
      "top 5000 conserved elements (8way)" dm1

    # and clean up bluearc.
    rm -r /cluster/bluearc/dm1/phastCons/{ELEMENTS,POSTPROBS,wib}
# NOT DONE (I see more phastCons in the future :)
    rm -r /cluster/bluearc/dm1/chrom


# SELF ALIGNMENTS (DONE 2/24/05 angie)
    # Doing this partly as a test of doBlastzChainNet.pl...
    ssh kksilo
    mkdir /cluster/data/dm1/bed/blastz.dm1.2005-02-24
    cd /cluster/data/dm1/bed/blastz.dm1.2005-02-24
    cat << '_EOF_' > DEF
# D. melanogaster vs. self

# TARGET - D. melanogaster
SEQ1_DIR=/iscratch/i/dm1/nib
SEQ1_CHUNK=5000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/dm1/chrom.sizes

# QUERY - D. melanogaster
SEQ2_DIR=/iscratch/i/dm1/nib
SEQ2_CHUNK=5000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/dm1/chrom.sizes

BASE=/cluster/data/dm1/bed/blastz.dm1.2005-02-24

'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /panasas/store/dm1SelfOut >& do.log &
    tail -f do.log
    rmdir /panasas/store/dm1SelfOut
    ln -s blastz.dm1.2005-02-24 /cluster/data/dm1/bed/blastz.dm1
    # chainSelf is already in top-level trackDb.ra, so no need to add.
    # Add /usr/local/apache/htdocs/goldenPath/dm1/vsSelf/README.txt

    # So, similar to above, we'll shoot for 2/3 * 0.995 / 0.964 =
    # 68.8% coverage in coding regions

    # NOTE: the pairwise numbers are the same as before

# STANDARDIZED VERSION OF PHASTCONS
# MELANOGASTER/YAKUBA/PSEUDOOBSCURA/ANOPHELES, FOR PHASTCONS PAPER
# (DONE 1/5/05 acs)

    # copy chrom fa to bluearc, break up the genome-wide MAFs into pieces
    mkdir -p /cluster/bluearc/dm1/chrom
    cp -p /cluster/data/dm1/?{,?}/chr*.fa /cluster/bluearc/dm1/chrom/
    mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd
    mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd/run.split
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd/run.split
    set WINDOWS = /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/WINDOWS
    rm -fr $WINDOWS
    mkdir -p $WINDOWS
    cat << 'EOF' > doSplit.sh
#!/bin/csh -ef

set PHAST=/cluster/bin/phast
set FA_SRC=/cluster/bluearc/dm1/chrom
set WINDOWS=/cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/WINDOWS

set maf=$1
set c = $maf:t:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
${PHAST}/msa_split $maf -i MAF -M ${FA_SRC}/$c.fa -O dm1,droYak1,dp2,anoGam1 \
   -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
  gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
    chmod a+x doSplit.sh
    rm -f jobList
    foreach file (/cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/mypa/*.maf) 
      if (-s $file) then
        echo "doSplit.sh {check in line+ $file}" >> jobList
      endif
    end
    para create jobList
    para try,  check, push, check...
    cd ..

    # now estimate parameters.  we have to iterate until we get
    # coverage of 0.688 in exons and relative entropy 9.78, as above.

    # we can start the process by seeing how close we were with our
    # previous estimates (target coverage 0.2, expected length 25)
    featureBits -enrichment dm1 bdgpGene:cds ../../phastCons/all.bed
# bdgpGene:cds 17.224%, ../../phastCons/all.bed 29.326%, both 7.519%, cover 43.65%, enrich 1.49x
    consEntropy 0.2 25 ../../phastCons/run.estimate/ave.{cons,noncons}.mod
# Transition parameters: gamma=0.200000, omega=25.000000, mu=0.040000, nu=0.010000
# Relative entropy: H=0.623983 bits/site
# Required length: N=19.348735 sites
# Total entropy: NH=12.073289 bits
    # so, we need to crank the coverage up quite a bit and reduce the
    # entropy.  We'll try target coverage 0.4 and we'll leave the
    # expected length as it is because the entropy will tend to
    # decrease with a large increase in coverage

    mkdir run.estimate
    cd run.estimate
    # use previous results for starting model and GC content
    cp /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastCons/run.estimate/ave.noncons.mod starting-tree.mod
    #(GC content is 0.446)

    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - starting-tree.mod --gc 0.446 --nrates 1,1 \
    --no-post-probs --ignore-missing --expected-lengths 25 \
    --target-coverage 0.40 --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    chmod a+x doEstimate.sh
    rm -fr LOG TREES
    mkdir -p LOG TREES
    rm -f jobList
    foreach f ($WINDOWS/*.ss.gz)
      set root = $f:t:r:r
      echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
    end
    # run cluster job
    ssh kk9
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd/run.estimate
    para create jobList
    para try, check, push, check, ...
# Completed: 134 of 134 jobs
# CPU time in finished jobs:      10970s     182.84m     3.05h    0.13d  0.000 y
# IO & Wait Time:                   361s       6.01m     0.10h    0.00d  0.000 y
# Average job time:                  85s       1.41m     0.02h    0.00d
# Longest job:                      171s       2.85m     0.05h    0.00d
# Submission to last job:           228s       3.80m     0.06h    0.00d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ls -1 TREES/*.cons.mod > cons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
      --output-average ave.cons.mod > cons_summary.txt
    ls -1 TREES/*.noncons.mod > noncons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
      --output-average ave.noncons.mod > noncons_summary.txt
    grep TREE ave*.mod
# ave.cons.mod:TREE: (((dm1:0.022875,droYak1:0.029296):0.045785,dp2:0.080314):0.111872,anoGam1:0.111872);
# ave.noncons.mod:TREE: (((dm1:0.093309,droYak1:0.119704):0.192065,dp2:0.337507):0.475498,anoGam1:0.475498);
    cat cons_summary.txt 
    # look over the files cons_summary.txt and noncons_summary.txt.  See notes above.

    # Now set up the cluster job for computing the conservation scores
    # and predicted elements.
    mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd/run.phast
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd/run.phast
    cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
    ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
    --expected-lengths 25 --target-coverage 0.40 --quiet --seqname $chr \
    --idpref $pref \
    --viterbi /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/ELEMENTS/$pref.bed --score \
    --require-informative 0 \
  > $tmpfile
gzip -c $tmpfile > /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
    chmod a+x doPhastCons.sh
    rm -fr /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/{POSTPROBS,ELEMENTS}
    mkdir -p /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/{POSTPROBS,ELEMENTS}
    rm -f jobList
    foreach f (/cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/WINDOWS/*.ss.gz)
      echo doPhastCons.sh $f >> jobList
    end
    para create jobList
    para try, check, push, check, ...
# Completed: 134 of 134 jobs
# CPU time in finished jobs:       1530s      25.50m     0.42h    0.02d  0.000 y
# IO & Wait Time:                   390s       6.50m     0.11h    0.00d  0.000 y
# Average job time:                  14s       0.24m     0.00h    0.00d
# Longest job:                       20s       0.33m     0.01h    0.00d
# Submission to last job:            42s       0.70m     0.01h    0.00d
    
    # check coverage and entropy
    cat /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/ELEMENTS/*.bed > all.raw.bed
    featureBits -enrichment dm1 bdgpGene:cds all.raw.bed
# bdgpGene:cds 17.224%, all.raw.bed 40.474%, both 10.610%, cover 61.60%, enrich 1.52x
    consEntropy --NH 9.78 0.4 25 ../run.estimate/ave.{cons,noncons}.mod
# Relative entropy: H=0.702265 bits/site
# Required length: N=14.324859 sites
# Total entropy: NH=10.059845 bits
# Recommended expected length: omega=22.594418 sites (for NH=9.780000)

    # now iterate until convergence
    # second try: --target-coverage 0.55 --expected-lengths 25
    # third try: --target-coverage 0.50 --expected-lengths 30
    # fourth try: --target-coverage 0.47 --expected-lengths 30
    # fourth try: --target-coverage 0.468 --expected-lengths 28.1

    # final results:
# bdgpGene:cds 17.224%, all.raw.bed 45.385%, both 11.805%, cover 68.54%, enrich 1.51x
# Transition parameters: gamma=0.468000, omega=28.100000, mu=0.035587, nu=0.031306
# Relative entropy: H=0.725136 bits/site
# Required length: N=13.512072 sites
# Total entropy: NH=9.798094 bits
# ave.cons.mod:TREE: (((dm1:0.024397,droYak1:0.031171):0.049856,dp2:0.086316):0.125701,anoGam1:0.125701);
# ave.noncons.mod:TREE: (((dm1:0.098596,droYak1:0.126138):0.207382,dp2:0.359019):0.531038,anoGam1:0.531038);

    # transform scores to be in 0-1000 interval then load track
    awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
      all.raw.bed | /cluster/bin/scripts/lodToBedScore > all.bed
    hgLoadBed dm1 phastConsElementsPaper all.bed

    # Create wiggle on the small cluster
    mkdir /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd/phastCons/run.wib
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd//phastCons/run.wib
    rm -rf /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/wib
    mkdir -p /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/wib
    cat << 'EOF' > doWigEncode
#!/bin/csh -ef
set chr = $1
cd /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/wib
zcat `ls -1 /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/POSTPROBS/$chr.*.pp.gz \
      | sort -t\. -k2,2n` \
| wigEncode stdin ${chr}_phastConsPaper.wi{g,b}
'EOF'
# << for emacs
    chmod a+x doWigEncode
    rm -f jobList
    foreach chr (`ls -1 /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/POSTPROBS \
                  | awk -F\. '{print $1}' | sort -u`)
      echo doWigEncode $chr >> jobList
    end
    ssh kk9, para create jobList, etc.

    # copy wibs, wigs and POSTPROBS from bluearc 
    cd /cluster/data/dm1/bed/multiz.dm1droYak1dp2anoGam1/phastConsStd
    rm -rf wib POSTPROBS
    rsync -av /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/wib .
    rsync -av /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/POSTPROBS .

    # load wiggle component of Conservation track
    mkdir /gbdb/dm1/wib/mzDy1Dp2Ag1_phastPaper
    chmod 775 . wib
    chmod 664 wib/*.wib
    ln -s `pwd`/wib/*.wib /gbdb/dm1/wib/mzDy1Dp2Ag1_phastPaper/
    hgLoadWiggle dm1 phastConsPaper \
      -pathPrefix=/gbdb/dm1/wib/mzDy1Dp2Ag1_phastPaper wib/*.wig

    # create rest of Conservation track (duplicate of current 4-way)
    # setup external files for database reference
    # (reuse links in /gbdb/dm1/mzDy1Dp2Ag1_phast)
    # load into database
    hgLoadMaf -warn -pathPrefix=/gbdb/dm1/mzDy1Dp2Ag1_phast dm1 multizForPhastConsPaper
    # (we'll also just reuse the pairwise tables)

    # clean up bluearc.  Leave WINDOWS dir for now
    rm -r /cluster/bluearc/dm1/multiz.dm1droYak1dp2anoGam1/phastCons/{POSTPROBS,ELEMENTS,wib}


#####################################################################
# SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie)
    # File emailed from Xinwei She <xws@u.washington.edu>
    mkdir /cluster/data/dm1/bed/genomicSuperDups
    cd /cluster/data/dm1/bed/genomicSuperDups
    sed -e 's/\t_\t/\t-\t/' dm1_genomicSuperDup.tab \
    | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
    | hgLoadBed dm1 genomicSuperDups stdin \
      -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql


