# for emacs: -*- mode: sh; -*-


# This file describes how we made the browser database on the 
# Chicken (Gallus gallus) February 2004 release.


# CREATE BUILD DIRECTORY (DONE 2/23/04 angie)
    ssh kksilo
    mkdir /cluster/store7/galGal2
    ln -s /cluster/store7/galGal2 /cluster/data/galGal2


# DOWNLOAD MITOCHONDRION GENOME SEQUENCE (DONE 2/23/04 angie)
    mkdir /cluster/data/galGal2/M
    cd /cluster/data/galGal2/M
    # go to http://www.ncbi.nih.gov/ and search Nucleotide for 
    # "gallus mitochondrion genome".  That shows the gi number:
    # 5834843 
    # Use that number in the entrez linking interface to get fasta:
    wget -O chrM.fa \
      'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Text&db=Nucleotide&uid=5834843&dopt=FASTA'
    # Edit chrM.fa: make sure the long fancy header line says it's the 
    # Gallus gallus mitochondrion complete genome, and then replace the 
    # header line with just ">chrM".


# DOWNLOAD WHOLE GENOME SHOTGUN CONTIGS & QUAL (DONE 2/23/04 angie)
    ssh kksilo
    cd /cluster/data/galGal2
    # Contig sequences have been stable for a while:
    wget ftp://genome.wustl.edu/private/chicken_040105/Fasta/chicken_040105.contigs.tar.gz
    # Official location (password-protected):
    # http://genome.wustl.edu/projects/chicken/chicken_analysis/chicken_040105.contigs.tar.gz
    mkdir contigs
    cd contigs
    tar xvfz ../chicken_040105.contigs.tar.gz
    foreach f (chicken_031214.pcap.contigs*)
      set n = `echo $f | sed -e 's/chicken_031214.pcap.contigs//'`
      mv $f contigs$n.fa
    end
    faSize *.fa
#1054180845 bases (107580 N's 1054073265 real) in 111864 sequences in 100 files
#Total size: mean 9423.8 sd 19990.9 min 52 (Contig28908.1) max 441791 (Contig132.21) median 2122
#N count: mean 1.0 sd 5.1
    mkdir /cluster/bluearc/galGal2
    cat ../contigs/*.fa > /cluster/bluearc/galGal2/allContigs.fa
    # Get qual scores too
    cd /cluster/data/galGal2
    wget ftp://genome.wustl.edu/private/chicken_040105/Qual/chicken_040105.contigs.qual.tar.gz
    # Official location (password-protected):
    # http://genome.wustl.edu/projects/chicken/chicken_analysis/chicken_040105.contigs.qual.tar.gz
    

# DOWNLOAD AGP, BUILD AND CHECK CHROM-LEVEL SEQUENCE (DONE 2/24/04 angie)
    ssh kolossus
    cd /cluster/data/galGal2
    wget ftp://genome.wustl.edu/private/lhillier/old/chicken_agp_040224.tar.gz
    # anticipated official location (password-protected):
    # http://genome.wustl.edu/projects/chicken/chicken_analysis/chicken_agp_040224.tar.gz
    tar xvzf chicken_agp_040224.tar.gz
    cd agp_040224
    cp /dev/null ../chrom.lst
    foreach f (*.agp)
      set chr = `echo $f:r | sed -e 's/^chr//'`
      set base = `echo $chr | sed -e 's/_random$//'`
      mkdir -p ../$base
      cp -p $f ../$base
      if ("$chr" == "$base") echo $chr >> ../chrom.lst
    end
    # OK, tack chrM on there too:
    echo M >> ../chrom.lst
    cd /cluster/data/galGal2
    foreach c (`cat chrom.lst`)
      foreach agp ($c/chr$c{,_random}.agp)
        if (-e $agp) then
          set fa = $agp:r.fa
          echo building $fa from $agp
          agpToFa -simpleMultiMixed $agp $agp:t:r $fa \
            /cluster/bluearc/galGal2/allContigs.fa
        endif
      end
    end
    # checkAgpAndFa prints out way too much info -- keep the end/stderr only:
    foreach c (`cat chrom.lst`)
      foreach agp ($c/chr$c{,_random}.agp)
        if (-e $agp) then
          set fa = $agp:r.fa
          echo checking consistency of $agp and $fa
          ~/bin/$MACHTYPE/checkAgpAndFa $agp $fa | tail -1
        endif
      end
    end
    faSize */chr*.fa
#1133629576 bases (79539536 N's 1054090040 real) in 54 sequences in 54 files
#Total size: mean 20993140.3 sd 41649023.2 min 1525 (chrE64) max 188239860 (chr1) median 4731479
#N count: mean 1472954.4 sd 5969394.3
    # excluding chrM.fa:  note the same #real as for WGS contigs:
#1133612801 bases (79539536 N's 1054073265 real) in 53 sequences in 53 files
#Total size: mean 21388920.8 sd 41944943.4 min 1525 (chrE64) max 188239860 (chr1) median 4731479
#N count: mean 1500746.0 sd 6022991.0


# COMPARE BASE COUNTS WITH WUSTL'S (DONE 2/27/04 angie)
    # Compare LaDeana's size summary file with our chrom faSize output:
    ssh kksilo
    cd /cluster/data/galGal2
    # Download from password-protected location:
    wget --http-user=xxxxxx --http-passwd=xxxxx \
      -O basepairs_per_chromosome \
      http://genome.wustl.edu/projects/chicken/chicken_analysis/rsc/basepairs_per_chromosome
    # Actually, we can only compare total bases with faSize because faSize 
    # doesn't differentiate between an N from a gap and an N base.  
    perl -wpe   'chop; if (! /[0-9]/ || /TOTALS/) { s/^.*$//; next; } \
                 s/^GGA//; \
                 if (s/^(\w+)\s+(\d+)\s+(\d+)\s*$//) { \
                   ($c, $noGapBP, $totalBP) = ($1, $2, $3); \
                   $c =~ s/Unlocalized/Un/; \
                   $chr = "chr$c"; \
                   $c =~ s/_random//; \
                   $faSize = "faSize $c/$chr.fa"; \
                   $faSize = `$faSize`; \
                   if ($faSize =~ /^(\d+) bases \((\d+) N.s (\d+) real/) { \
                     if ($totalBP != $1) { \
                       print "$chr totalBP: WUSTL $totalBP, us $1!\n"; \
                     } \
                   } else { print "parsed faSize wrong:\n$faSize\n"; } \
                 } else { print "what is this?\n$_\n"; }' \
      basepairs_per_chromosome
    # oops, basepairs_per_chromosome doesn't include terminal gaps in total BP
    # counts... told LaDeana:
#chr12 totalBP: WUSTL 19811895, us 19821895!
#chr23 totalBP: WUSTL 5166127, us 5666127!
#chr28 totalBP: WUSTL 4231479, us 4731479!


# BREAK UP SEQUENCE INTO 5 MB CHUNKS AT CONTIGS/GAPS (DONE 2/24/04 angie)
    ssh kksilo
    cd /cluster/data/galGal2
    foreach c (`cat chrom.lst`)
      foreach agp ($c/chr$c{,_random}.agp)
        if (-e $agp) then
          set fa = $agp:r.fa
          echo splitting $agp and $fa
          cp -p $agp $agp.bak
          cp -p $fa $fa.bak
          splitFaIntoContigs $agp $fa . -nSize=5000000
        endif
      end
    end
    # splitFaIntoContigs makes new dirs for _randoms.  Move their contents 
    # back into the main chrom dirs and get rid of the _random dirs.
    foreach d (*_random)
      set base = `echo $d | sed -e 's/_random$//'`
      mv $d/lift/oOut.lst $base/lift/rOut.lst
      mv $d/lift/ordered.lft $base/lift/random.lft
      mv $d/lift/ordered.lst $base/lift/random.lst
      rmdir $d/lift
      mv $d/* $base
      rmdir $d
    end
    # Make a "pseudo-contig" for processing chrM too:
    mkdir M/chrM_1
    sed -e 's/chrM/chrM_1/' M/chrM.fa > M/chrM_1/chrM_1.fa
    mkdir M/lift
    echo "chrM_1/chrM_1.fa.out" > M/lift/oOut.lst
    echo "chrM_1" > M/lift/ordered.lst
    echo "0	M/chrM_1	16775	chrM	16775" > M/lift/ordered.lft


# MAKE JKSTUFF AND BED DIRECTORIES (DONE 2/23/04 angie)
    # This used to hold scripts -- better to keep them inline here so 
    # they're in CVS.  Now it should just hold lift file(s) and 
    # temporary scripts made by copy-paste from this file.  
    mkdir /cluster/data/galGal2/jkStuff
    # This is where most tracks will be built:
    mkdir /cluster/data/galGal2/bed


# CREATING DATABASE (DONE 2/23/04 angie)
    ssh hgwdev
    echo 'create database galGal2' | hgsql ''
    # Use df to make sure there is at least 75G free on hgwdev:/var/lib/mysql
    df -h /var/lib/mysql
    # /dev/sdc1             1.8T  318G  1.4T  20% /var/lib/mysql


# CREATING GRP TABLE FOR TRACK GROUPING (DONE 2/23/04 angie)
    ssh hgwdev
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" \
      | hgsql galGal2


# MAKE CHROMINFO TABLE WITH (TEMPORARILY UNMASKED) NIBS (DONE 2/24/04 angie)
    # Make nib/, unmasked until RepeatMasker and TRF steps are done.
    # Do this now so we can load up RepeatMasker and run featureBits; 
    # can also load up other tables that don't depend on masking.  
    ssh kksilo
    cd /cluster/data/galGal2
    mkdir nib
    foreach c (`cat chrom.lst`)
      foreach f ($c/chr${c}{,_random}.fa)
        if (-e $f) then
          echo "nibbing $f"
          /cluster/bin/i386/faToNib $f nib/$f:t:r.nib
        endif
      end
    end

    # Make symbolic links from /gbdb/galGal2/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/galGal2/nib
    foreach f (/cluster/data/galGal2/nib/chr*.nib)
      ln -s $f /gbdb/galGal2/nib
    end
    # Load /gbdb/galGal2/nib paths into database and save size info.
    cd /cluster/data/galGal2
    hgsql galGal2  < $HOME/kent/src/hg/lib/chromInfo.sql
    hgNibSeq -preMadeNib galGal2 /gbdb/galGal2/nib */chr*.fa
    echo "select chrom,size from chromInfo" | hgsql -N galGal2 > chrom.sizes
    # take a look at chrom.sizes, should be 54 lines
    wc chrom.sizes


# REPEAT MASKING (DONE 2/25/04 angie)
    #- Split contigs into 500kb chunks, at gaps if possible:
    ssh kksilo
    cd /cluster/data/galGal2
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}*_?{,?})
        cd $d
        echo "splitting $d"
        set contig = $d:t
        ~/bin/i386/faSplit gap $contig.fa 500000 ${contig}_ -lift=$contig.lft \
            -minGapSize=100
        cd ../..
      end
    end

    #- Make the run directory and job list:
    cd /cluster/data/galGal2
    cat << '_EOF_' > jkStuff/RMChicken
#!/bin/csh -fe

cd $1
pushd .
/bin/mkdir -p /tmp/galGal2/$2
/bin/cp $2 /tmp/galGal2/$2/
cd /tmp/galGal2/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -ali -s -spec chicken $2
popd
/bin/cp /tmp/galGal2/$2/$2.out ./
if (-e /tmp/galGal2/$2/$2.align) /bin/cp /tmp/galGal2/$2/$2.align ./
if (-e /tmp/galGal2/$2/$2.tbl) /bin/cp /tmp/galGal2/$2/$2.tbl ./
if (-e /tmp/galGal2/$2/$2.cat) /bin/cp /tmp/galGal2/$2/$2.cat ./
/bin/rm -fr /tmp/galGal2/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/galGal2/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/galGal2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMChicken
    mkdir RMRun
    cp /dev/null RMRun/RMJobs
    foreach c (`cat chrom.lst`)
      foreach d ($c/chr${c}{,_random}_?{,?})
          set ctg = $d:t
          foreach f ( $d/${ctg}_?{,?}.fa )
            set f = $f:t
            echo /cluster/data/galGal2/jkStuff/RMChicken \
                 /cluster/data/galGal2/$d $f \
               '{'check out line+ /cluster/data/galGal2/$d/$f.out'}' \
              >> RMRun/RMJobs
          end
      end
    end

    #- Do the run
    ssh kk
    cd /cluster/data/galGal2/RMRun
    para create RMJobs
    para try, para check, para check, para push, para check,...
#Completed: 2724 of 2724 jobs
#Average job time:                2033s      33.89m     0.56h    0.02d
#Longest job:                     9752s     162.53m     2.71h    0.11d
#Submission to last job:          9771s     162.85m     2.71h    0.11d

    #- Lift up the 500KB chunk .out's to 5MB ("pseudo-contig") level
    ssh kksilo
    cd /cluster/data/galGal2
    foreach d (*/chr*_?{,?})
      set contig = $d:t
      echo $contig
      liftUp $d/$contig.fa.out $d/$contig.lft warn $d/${contig}_*.fa.out \
        > /dev/null
    end

    #- Lift pseudo-contigs to chromosome level
    foreach c (`cat chrom.lst`)
      echo lifting $c
      cd $c
      if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
        liftUp chr$c.fa.out lift/ordered.lft warn `cat lift/oOut.lst` \
        > /dev/null
      endif
      if (-e lift/random.lft && ! -z lift/random.lft) then
        liftUp chr${c}_random.fa.out lift/random.lft warn `cat lift/rOut.lst` \
        > /dev/null
      endif
      cd ..
    end

    #- Load the .out files into the database with:
    ssh hgwdev
    cd /cluster/data/galGal2
    hgLoadOut galGal2 */chr*.fa.out


# RE-RUN PROCESSREPEATS (DONE 2/25/04 angie)
    # Sure hope this is the only time we ever need to do this:
    # Arian sent an update to the ProcessRepeats script -- we can re-run 
    # from .cat files, should go pretty quickly, and Arian says it shouldn't 
    # affect our masking that much.
    ssh kksilo
    cd /cluster/data/galGal2
    foreach d (*/chr*_?{,?})
      echo $d
      rm -rf /tmp/gawdammit
      mkdir /tmp/gawdammit
      cp -p $d/*.cat /tmp/gawdammit
      pushd /tmp/gawdammit
      /cluster/home/angie/cb/RepeatMasker/ProcessRepeats -spec gallus *.cat \
      > /dev/null
      popd
      liftUp $d/$d:t.new.out $d/$d:t.lft warn /tmp/gawdammit/*.out > /dev/null
    end
    # lift *.new.out to chrom-level new.out
    foreach c (`cat chrom.lst`)
      echo lifting $c
      cd $c
      if (-e lift/ordered.lft && ! -z lift/ordered.lft) then
        liftUp chr$c.new.out lift/ordered.lft warn \
          `cat lift/oOut.lst | sed -e 's/.fa.out/.new.out/'` \
        > /dev/null
      endif
      if (-e lift/random.lft && ! -z lift/random.lft) then
        liftUp chr${c}_random.new.out lift/random.lft warn \
          `cat lift/rOut.lst | sed -e 's/.fa.out/.new.out/'` \
        > /dev/null
      endif
      cd ..
    end
    # Compare old .fa.out to new (reloading chr*_rmsk in the process)
    ssh hgwdev
    mkdir /cluster/data/galGal2/bed/ProcessRepeatsCheck
    cd /cluster/data/galGal2/bed/ProcessRepeatsCheck
    featureBits galGal2 rmsk -bed=rmsk.orig.bed
#104249252 bases of 1054197620 (9.889%) in intersection
    hgLoadOut galGal2 */chr*.new.out
    featureBits galGal2 rmsk -bed=rmsk.new.bed
#104249260 bases of 1054197620 (9.889%) in intersection
    featureBits galGal2 rmsk \!rmsk.orig.bed -bed=notIn.orig.bed
#8 bases of 1054197620 (0.000%) in intersection
    featureBits galGal2 \!rmsk rmsk.orig.bed -bed=notIn.new.bed
#0 bases of 1054197620 (0.000%) in intersection
    # SWEET!
    cat notIn.orig.bed 
#chr1    141589449       141589454       chr1.1
#chr2    110897485       110897488       chr2.1
    # In chr1, looks like these two original lines...
# 1014  31.8  6.7  1.7  chr1      141588904 141589449 (46650411) +  CR1-Y4         LINE/CR1               281 4041  (472)    176
# 1099  27.8  6.2  1.6  chr1      141589455 141589838 (46650022) +  CR1-Y4         LINE/CR1               806 1207    (7)    177
    # ... got merged:
# 1099  30.1  6.5  1.7  chr1      141588904 141589838 (46650022) +  CR1-Y4         LINE/CR1              3454 4508    (7)    176
    # In chr2, 
# 1305  23.6  2.1  1.2  chr2      110897148 110897485 (36693280) C  CR1-Y4         LINE/CR1              (14) 1200    860     96
#  335  20.4  0.9  8.8  chr2      110897489 110897601 (36693164) C  CR1-Y          LINE/CR1             (423) 4090   3987     97
    # get merged to
# 1305  22.8  1.8  3.3  chr2      110897148 110897601 (36693164) C  CR1-Y4         LINE/CR1              (14) 4501   3987     

    # Replace old .fa.outs with .new.outs
    foreach f ( */chr*.new.out */*/chr*_?{,?}.new.out )
      set g = `echo $f | sed -e 's/.new.out/.fa.out/'`
      mv $f $g
    end
    # Remove lowest-level old .fa.out's -- and .fa's while at it
    foreach d (*/chr*_?{,?})
      rm $d/${d:t}_?{,?}.fa.out
      rm $d/${d:t}_?{,?}.fa
    end


# VERIFY REPEATMASKER RESULTS (DONE 2/26/04 angie)
    # Eyeball some repeat annotations in the browser, compare to lib seqs.
    # Run featureBits on galGal2 and on a comparable genome build, and compare:
    ssh hgwdev
    featureBits galGal2 rmsk
#104249260 bases of 1054197620 (9.889%) in intersection
    # slight increase from galGal1:
#103553237 bases of 1054197620 (9.823%) in intersection


# MAKE 10.OOC, 11.OOC FILES FOR BLAT (DONE 2/27/04 angie)
    # Use -repMatch=380 (based on size -- for human we use 1024, and 
    # chicken size is ~37% of human)
    ssh kkr1u00
    mkdir /cluster/data/galGal2/bed/ooc
    cd /cluster/data/galGal2/bed/ooc
    ls -1 /cluster/data/galGal2/nib/chr*.nib > nib.lst
    blat nib.lst /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/bluearc/galGal2/11.ooc -repMatch=380
#Wrote 13552 overused 11-mers to /cluster/bluearc/galGal2/11.ooc
    blat nib.lst /dev/null /dev/null -tileSize=10 \
      -makeOoc=/cluster/bluearc/galGal2/10.ooc -repMatch=380
#Wrote 169709 overused 10-mers to /cluster/bluearc/galGal2/10.ooc
    cp -p /cluster/bluearc/galGal2/*.ooc /iscratch/i/galGal2/
    iSync


# MAKE LIFTALL.LFT (DONE 2/24/04 angie)
    ssh kksilo
    cd /cluster/data/galGal2
    cat */lift/{ordered,random}.lft > jkStuff/liftAll.lft


# PUT UNMASKED CHUNKS ON BLUEARC (DONE 2/24/04 angie)
    # Some of Terry's staging scripts require a flat dir containing 
    # all contigs; also, doesn't hurt to set aside unmasked seq.
    ssh kksilo
    cd /cluster/data/galGal2
    mkdir /cluster/bluearc/galGal2/unmaskedChunks
    foreach d (*/chr*_?{,?})
      cp -p $d/${d:t}.fa /cluster/bluearc/galGal2/unmaskedChunks
    end


# MAKE STS MARKERS TRACK (TODO 3/04 angie -- need more info from Terry)
    set base = /cluster/data/galGal2/bed/markers
    mkdir -p $base/primers $base/sts
    # Need to find out how Terry built these:
    cp -p /cluster/bluearc/gg1/markers/sts/convert.pl $base
    cp -p /cluster/bluearc/gg1/markers/stsAlias.bed $base
    cp -p /cluster/bluearc/gg1/markers/stsInfoGG.bed $base

    # Need to find out where Terry got these files:
    cp -p /cluster/bluearc/gg1/markers/primers/all.primers{,.fa} $base/primers
    cp -p /cluster/bluearc/gg1/markers/sts/all.sequence.fa $base/sts

    # Run e-PCR on primers
    mkdir $base/primers/epcr
    cd $base/primers/epcr
    mkdir epcr.out
    /cluster/bin/scripts/createCloneList \
      /cluster/bluearc/galGal2/unmaskedChunks
    # That script creates in.lst/$n.in files, where $n is 
    # the contig number *plus 1*!  in.lst/0.in is empty and makes 
    # para create complain, so get rid of it.  
    rm in.lst/0.in
    egrep -v '/0.in$' files.lst > tmp.lst; mv tmp.lst files.lst
    echo $base/primers/all.primers > epcr.lst
    cat << '_EOF_' > template
#LOOP
/cluster/bin/scripts/runEpcr {check in line+ $(path1)} {check in line+ $(path2)} {check out exists epcr.out/$(root2).epcr}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    /cluster/bin/i386/gensub2 epcr.lst files.lst template jobList
    ssh kk
    set base = /cluster/data/galGal2/bed/markers
    cd $base/primers/epcr
    para create jobList
    para try, check, push, check, ...
#Completed: 251 of 251 jobs
#Average job time:                  30s       0.50m     0.01h    0.00d
#Longest job:                       45s       0.75m     0.01h    0.00d
#Submission to last job:            46s       0.77m     0.01h    0.00d
    cat `ls -1 epcr.out/* | sort -g` > all.epcr

    # blat primers -- with old blat.2 !
    mkdir $base/primers/blat
    cd $base/primers/blat
    mkdir primers.out
    ls -1S /cluster/bluearc/galGal2/unmaskedChunks/*.fa > contigs.lst
    echo $base/primers/all.primers.fa > primers.lst
    cat << '_EOF_' > template
#LOOP
/cluster/home/kent/bin/i386/blat.2 -tileSize=10 -ooc=/cluster/bluearc/galGal2/10.ooc -minMatch=1 -minScore=1 -minIdentity=75 {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ primers.out/$(root1).$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    /cluster/bin/i386/gensub2 contigs.lst primers.lst template jobList
    ssh kk
    set base = /cluster/data/galGal2/bed/markers
    cd $base/primers/blat
    para create jobList
    para try, check, push, check, ...
#Completed: 261 of 261 jobs
#Average job time:                  18s       0.30m     0.00h    0.00d
#Longest job:                       27s       0.45m     0.01h    0.00d
#Submission to last job:            28s       0.47m     0.01h    0.00d
    /cluster/bin/i386/pslSort dirs primers.psl temp primers.out
    rm -r temp

    # Make primers.final from e-PCR and blat results:
    cd $base/primers
    filterPrimers -chicken ../stsInfoGG.bed blat/primers.psl all.primers \
      epcr/all.epcr > primers.psl.filter
    extractPslInfo primers.psl.filter
    mv primers.psl.filter.initial primers.initial
    sort -k 4n primers.initial > primers.final

    # blat sts sequences
    mkdir $base/sts/blat
    cd $base/sts/blat
    mkdir stsMarkers.out
    ls -1S /cluster/bluearc/galGal2/unmaskedChunks/*.fa > contigs.lst
    echo $base/sts/all.sequence.fa > stsMarkers.lst
    cat << '_EOF_' > template
#LOOP
/cluster/bin/i386/blat {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ stsMarkers.out/$(root1).psl}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    /cluster/bin/i386/gensub2 contigs.lst stsMarkers.lst template jobList
    ssh kk
    set base = /cluster/data/galGal2/bed/markers
    cd $base/sts/blat
    para create jobList
    para try, check, push, check, ...
#Completed: 261 of 261 jobs
#Average job time:                  96s       1.59m     0.03h    0.00d
#Longest job:                      199s       3.32m     0.06h    0.00d
#Submission to last job:           199s       3.32m     0.06h    0.00d
    /cluster/bin/i386/pslSort dirs raw.psl temp stsMarkers.out
    /cluster/bin/i386/pslReps -nearTop=0.01 -minCover=0.6 -minAli=0.8 \
      -noIntrons raw.psl stsMarkers.psl /dev/null
    rm -rf temp
    rm -f raw.psl

    cd $base/sts
    filterPsl blat/stsMarkers.psl
    extractUniqPsl stsMarkers.psl.filter stsMarkers.psl.filter
    pslReps stsMarkers.psl.filter.dup.psl stsMarkers.psl.filter.1 /dev/null \
      -noIntrons -minCover=0.80 -minAli=0.90 -nearTop=0.05
    mv stsMarkers.psl.filter stsMarkers.psl.filter.orig
    cat stsMarkers.psl.filter.1 stsMarkers.psl.filter.uniq.psl \
      > stsMarkers.psl.filter
    extractPslInfo -h stsMarkers.psl.filter
    rm stsMarkers.psl.filter.1*
    rm stsMarkers.psl.filter.dup.psl stsMarkers.psl.filter.names \
      stsMarkers.psl.filter.uniq.psl
    extractPslInfo -h stsMarkers.psl.filter
    mv stsMarkers.psl.filter.initial stsMarkers.initial
    sort -k 4n stsMarkers.initial > stsMarkers.final.orig
    ../convert.pl ../stsAlias.bed stsMarkers.final.orig > stsMarkers.final

    cd $base
    combineSeqPrimerPos $base/sts/stsMarkers.final \
      $base/primers/primers.final > stsMarkers_pos.rdb
    # This step needs work -- output is 0-length
    createSTSbed stsInfoGG.bed stsMarkers_pos.rdb > stsMap.bed

    ssh hgwdev
    cd /cluster/data/galGal2/bed/markers
    # Make an stsMarkersTmp so we can at least see the locations
    awk '{printf "%s\t%s\t%s\t%s\t%d\n", $1, $2, $3, $4, $5 * 1000};' \
      sts/stsMarkers.final.orig \
    | liftUp -type=.bed stsMarkersTmp.bed ../../jkStuff/liftAll.lft warn stdin
    hgLoadBed galGal2 stsMarkerTmp stsMarkersTmp.bed 
    # lift & load all_sts_primer
    liftUp primers.filter.lifted.psl ../../jkStuff/liftAll.lft warn \
      primers/primers.psl.filter
    hgLoadPsl -table=all_sts_primer -fastLoad \
      galGal2 primers.filter.lifted.psl
    # lift & load all_sts_seq
    liftUp stsMarkers.filter.lifted.psl ../../jkStuff/liftAll.lft warn \
      sts/stsMarkers.psl.filter
    hgLoadPsl -table=all_sts_seq -fastLoad \
      galGal2 stsMarkers.filter.lifted.psl
    # load sequences
    mkdir /gbdb/galGal2/sts
    ln -s /cluster/data/galGal2/bed/markers/sts/all.sequence.fa \
      /gbdb/galGal2/sts/
    ln -s /cluster/data/galGal2/bed/markers/primers/all.primers.fa \
      /gbdb/galGal2/sts/
    hgLoadSeq galGal2 /gbdb/galGal2/sts/*.fa


# BACENDS (DONE 2/24/04 angie)
    ssh kksilo
    mkdir /cluster/data/galGal2/bed/bacends
    cd /cluster/data/galGal2/bed/bacends
    # download
    wget ftp://ftp.ncbi.nih.gov/genomes/BACENDS/gallus_gallus/AllBACends.mfa.gz
    wget ftp://ftp.ncbi.nih.gov/genomes/BACENDS/gallus_gallus/cl_acc_gi_len.gz
    gunzip *.gz
    /cluster/bin/scripts/convertBacEndPairInfo cl_acc_gi_len >& pairInfo.log
    perl -wpe 's/^>gi\|\d+\|gb\|(\w+)\.\d+\|.*/>$1/' AllBACends.mfa \
      > bacends.fa
    faSize bacends.fa
#146921771 bases (2724913 N's 144196858 real) in 135555 sequences in 1 files
#Total size: mean 1083.9 sd 184.7 min 64 (CC322877.1) max 2431 (CC263290.1) median 1095
#N count: mean 20.1 sd 73.1
    rm AllBACends.mfa
    mkdir /cluster/bluearc/galGal2/bacends
    faSplit sequence bacends.fa 20 /cluster/bluearc/galGal2/bacends/bacends
    ls -1S /cluster/bluearc/galGal2/bacends/*.fa > bacends.lst
    ls -1S /cluster/bluearc/galGal2/unmaskedChunks/*.fa > contigs.lst
    cat << '_EOF_' > template
#LOOP
/cluster/bin/i386/blat $(path1) $(path2) -tileSize=10 -ooc=/cluster/bluearc/galGal2/10.ooc {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    mkdir psl
    gensub2 contigs.lst bacends.lst template jobList
    ssh kk
    cd /cluster/data/galGal2/bed/bacends
    para create jobList
    para try, check, push, check, ...
#Completed: 5180 of 5180 jobs
#Average job time:                  97s       1.62m     0.03h    0.00d
#Longest job:                     5256s      87.60m     1.46h    0.06d
#Submission to last job:          5333s      88.88m     1.48h    0.06d
    # back on kksilo, filter and lift results:
    pslSort dirs raw.psl temp psl
    pslReps -nearTop=0.02 -minCover=0.60 -minAli=0.85 -noIntrons raw.psl \
      bacEnds.psl /dev/null
    liftUp bacEnds.lifted.psl ../../jkStuff/liftAll.lft warn bacEnds.psl
    rm -r temp
    rm raw.psl

    extractUniqPsl bacEnds.lifted.psl bacEnds.psl.filter
    pslReps bacEnds.psl.filter.dup.psl bacEnds.psl.filter.1 /dev/null \
      -noIntrons -minCover=0.80 -minAli=0.90 -nearTop=0.05
    extractUniqPsl bacEnds.psl.filter.1 bacEnds.psl.filter.1
    rm bacEnds.psl.filter.1
    pslReps bacEnds.psl.filter.1.dup.psl bacEnds.psl.filter.2 /dev/null \
      -noIntrons -minCover=0.90 -minAli=0.95 -nearTop=0.01
    extractUniqPsl bacEnds.psl.filter.2 bacEnds.psl.filter.2
    rm bacEnds.psl.filter.2
    pslReps bacEnds.psl.filter.2.dup.psl bacEnds.psl.filter.3 /dev/null \
      -noIntrons -minCover=0.95 -minAli=0.98 -nearTop=0.01
    cat bacEnds.psl.filter.3 \
        bacEnds.psl.filter.uniq.psl \
        bacEnds.psl.filter.1.uniq.psl \
        bacEnds.psl.filter.2.uniq.psl \
    > bacEnds.psl.filter.lifted
    rm *.names bacEnds.psl.filter.3 bacEnds.psl.filter.uniq.psl \
       bacEnds.psl.filter.1.uniq.psl bacEnds.psl.filter.2.uniq.psl

    extractPslInfo -h -be bacEnds.psl.filter.lifted
    mv bacEnds.psl.filter.lifted.initial bacEnds.initial
    findAccession -agp -chicken bacEnds.initial /cluster/data/galGal2
    rm bacEnds.initial
    sort -k 4 bacEnds.initial.acc > bacEnds.sort
    compileBacEnds bacEnds.sort bacEndPairs.txt bacEndSingles.txt \
      > bacEnds.temp
    sorttbl chrom chromStart < bacEnds.temp > bacEnds.rdb
    rm bacEnds.initial.acc bacEnds.sort bacEnds.temp

    createBACpairs bacEnds.rdb bacEndPairs.txt > temp.rdb
    sorttbl chrom1 -r ord1 chromStart1 < temp.rdb > bacEnds.pairs.rdb
    rm temp.rdb
    createBACpairs -bed bacEnds.rdb bacEndPairs.txt > bacEndPairs.rdb
    sorttbl chrom chromStart < bacEndPairs.rdb \
    | headchg -del > bacEndPairs.bed    
    rm bacEndPairs.rdb
    createBacPairsBad -bed bacEnds.rdb bacEndPairs.txt > bacEndPairsBad.rdb
    sorttbl chrom chromStart < bacEndPairsBad.rdb \
    | headchg -del > bacEndPairsBad.bed    
    rm bacEndPairsBad.rdb
    sorttbl chrom chromStart < bacEndPairsLong.rdb \
    | headchg -del > bacEndPairsLong.bed
    rm bacEndPairsLong.rdb
    findMissingEnd -max 500000 singleBac.txt bacEnds.psl.filter.2.dup.psl \
      bacEnds.psl.filter.1.dup.psl bacEnds.psl.filter.dup.psl > bacEndsMiss.psl
    mv bacEnds.psl.filter.lifted bacEnds.psl.filter.lifted.orig
    cat bacEnds.psl.filter.lifted.orig bacEndsMiss.psl \
      > bacEnds.psl.filter.lifted
    rm bacEnds.psl.filter.2.dup.psl bacEnds.psl.filter.1.dup.psl \
       bacEnds.psl.filter.dup.psl
    extractPslLoad bacEnds.psl.filter.lifted bacEndPairs.bed \
      bacEndPairsBad.bed bacEndPairsLong.bed > bacEnds.psl.rdb
    sorttbl tname tstart < bacEnds.psl.rdb | headchg -del > bacEnds.load.psl
    rm bacEnds.psl.rdb

    # Load seq's and alignments
    ssh hgwdev
    cd /cluster/data/galGal2/bed/bacends
    mkdir /gbdb/galGal2/bacends
    ln -s /cluster/data/galGal2/bed/bacends/bacends.fa /gbdb/galGal2/bacends
    hgLoadSeq galGal2 /gbdb/galGal2/bacends/bacends.fa
    hgLoadPsl -table=all_bacends -fastLoad \
      galGal2 /cluster/data/galGal2/bed/bacends/bacEnds.load.psl
    # Don't know what caused the warning... but all seem to have loaded.
#load of all_bacends did not go as planned: 76421 record(s), 0 row(s) skipped, 1 warning(s) loading psl.tab
    hgLoadBed -sqlTable=$HOME/kent/src/hg/lib/bacEndPairs.sql \
      -hasBin galGal2 bacEndPairs bacEndPairs.bed

    sed -e 's/bacEndPairs/bacEndPairsBad/g' \
      $HOME/kent/src/hg/lib/bacEndPairs.sql > bacEndPairsBad.sql
    ~/bin/i386/hgLoadBed -sqlTable=bacEndPairsBad.sql \
      -hasBin galGal2 bacEndPairsBad bacEndPairsBad.bed
    # Doh!  6 lines of that have negative starts.  Well, trim those 
    # for now and load the rest, then go back and find the neg-start 
    # cause...
    egrep -v '	-[0-9]+' bacEndPairsBad.bed > bacEndPairsBadFudge.bed
    ~/bin/i386/hgLoadBed -sqlTable=bacEndPairsBad.sql \
      -hasBin galGal2 bacEndPairsBad bacEndPairsBadFudge.bed

    sed -e 's/bacEndPairs/bacEndPairsLong/g' \
      $HOME/kent/src/hg/lib/bacEndPairs.sql > bacEndPairsLong.sql
    ~/bin/i386/hgLoadBed -sqlTable=bacEndPairsLong.sql \
      -hasBin galGal2 bacEndPairsLong bacEndPairsLong.bed


# SIMPLE REPEATS (TRF) (DONE 2/24/04 angie)
    # TRF runs pretty quickly now... it takes a few hours total runtime, 
    # so instead of binrsyncing and para-running, just do this on the
    # local fileserver
    ssh kksilo
    mkdir /cluster/data/galGal2/bed/simpleRepeat
    cd /cluster/data/galGal2/bed/simpleRepeat
    mkdir trf
    cp /dev/null jobs.csh
    foreach d (/cluster/data/galGal2/*/chr*_?{,?})
      set ctg = $d:t
      foreach f ($d/${ctg}.fa)
        set fout = $f:t:r.bed
        echo $fout
        echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
        >> jobs.csh
      end
    end
    csh -ef jobs.csh >&! jobs.log &
    # check on this with
    tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    endsInLf trf/*
    # When job is done do:
    liftUp simpleRepeat.bed /cluster/data/galGal2/jkStuff/liftAll.lft warn \
      trf/*.bed

    # Load into the database:
    ssh hgwdev
    hgLoadBed galGal2 simpleRepeat \
      /cluster/data/galGal2/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql
    featureBits galGal2 simpleRepeat
    # 8434365 bases of 1054197620 (0.800%) in intersection
    # Wow, identical to galGal1!  Which would really freak me out except that 
    # the underlying WGS contig sequences have not changed at all, only their 
    # relative ordering+orientation along chroms.  


# PROCESS SIMPLE REPEATS INTO MASK (DONE 2/24/04 angie)
    # After the simpleRepeats track has been built, make a filtered version 
    # of the trf output: keep trf's with period <= 12:
    ssh kksilo
    cd /cluster/data/galGal2/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/chr*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd /cluster/data/galGal2
    mkdir bed/simpleRepeat/trfMaskChrom
    foreach c (`cat chrom.lst`)
      if (-e $c/lift/ordered.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
          $c/lift/ordered.lst > $c/lift/oTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      endif
      if (-e $c/lift/random.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
           $c/lift/random.lst > $c/lift/rTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
      endif
    end
    # Here's the coverage for the filtered TRF:
    ssh hgwdev
    cat /cluster/data/galGal2/bed/simpleRepeat/trfMaskChrom/*.bed \
      > /tmp/filtTrf.bed
    featureBits galGal2 /tmp/filtTrf.bed
    # 4510381 bases of 1054197620 (0.428%) in intersection
    # OK, not *identical* to galGal1 (4512560) but very close.  Maybe 
    # there's a difference in the period chosen for some range of bases??  


# MASK SEQUENCE WITH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 2/25/04 angie)
    # Note: just to keep things consistent, redid chr1 and chr2 2/26 with 
    # the ProcessRepeats-only rerun results (only masking changes were 
    # 5bp in chr1 and 3bp in chr2)
    ssh kksilo
    cd /cluster/data/galGal2
    # Soft-mask (lower-case) the contig and chr .fa's, 
    # then make hard-masked versions from the soft-masked.  
    set trfCtg=bed/simpleRepeat/trfMask
    set trfChr=bed/simpleRepeat/trfMaskChrom
    foreach f (*/chr*.fa)
      echo "repeat- and trf-masking $f"
      maskOutFa -soft $f $f.out $f
      set chr = $f:t:r
      maskOutFa -softAdd $f $trfChr/$chr.bed $f
      echo "hard-masking $f"
      maskOutFa $f hard $f.masked
    end
# This warning is extremely rare -- if it indicates a problem, it's only 
# with the repeat annotation and doesn't affect our masking.
#WARNING: negative rEnd: -25 chr2:94907561-94907817 GGLTR3B1
# sent test case to Arian.  (line 101 of chr2_19_19.fa.out)
    foreach c (`cat chrom.lst`)
      echo "repeat- and trf-masking contigs of chr$c, chr${c}_random"
      foreach d ($c/chr*_?{,?})
        set ctg=$d:t
        set f=$d/$ctg.fa
        maskOutFa -soft $f $f.out $f
        maskOutFa -softAdd $f $trfCtg/$ctg.bed $f
        maskOutFa $f hard $f.masked
      end
    end
# same deal here:
#WARNING: negative rEnd: -25 chr2_19:4344750-4345006 GGLTR3B1
    #- Rebuild the nib files, using the soft masking in the fa:
    mkdir nib
    foreach f (*/chr*.fa)
      faToNib -softMask $f nib/$f:t:r.nib
    end
    # Make one big 2bit file as well, and make a link to it in 
    # /gbdb/galGal2/nib because hgBlat looks there:
    faToTwoBit */chr*.fa galGal2.2bit
    ln -s /cluster/data/galGal2/galGal2.2bit /gbdb/galGal2/nib/


# GOLD AND GAP TRACKS (DONE 2/24/04 angie)
    ssh hgwdev
    cd /cluster/data/galGal2
    hgGoldGapGl -noGl -chromLst=chrom.lst galGal2 /cluster/data/galGal2 .
    # featureBits fails if there's no chrM_gap, so make one:
    # echo "create table chrM_gap like chr1_gap" | hgsql galGal2
    # oops, that won't work until v4.1, so do this for the time being:
    echo "create table chrM_gap select * from chr1_gap where 0=1" \
    | hgsql galGal2


# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR GALGAL2 (DONE 2/24/04 angie)
    ssh hgwdev
    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd $HOME/kent/src/hg/makeDb/trackDb
    cvs up -d -P
    # Edit that makefile to add galGal2 in all the right places and do
    make update
    make alpha
    cvs commit makefile

    # Add dbDb and defaultDb entries:
    echo 'insert into dbDb (name, description, nibPath, organism,  \
          defaultPos, active, orderKey, genome, scientificName,  \
          htmlPath, hgNearOk)  \
          values("galGal2", "Feb. 2004", \
          "/gbdb/galGal2/nib", "Chicken", "chr13:13120186-13124117", 1, \
          35, "Chicken", "Gallus gallus", \
          "/gbdb/galGal2/html/description.html", 0);' \
    | hgsql -h genome-testdb hgcentraltest
    echo 'update defaultDb set name = "galGal2" where genome = "Chicken"' \
        | hgsql -h genome-testdb hgcentraltest


# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR GALGAL2 (DONE 2/26/04 angie)
    ssh hgwdev
    echo 'insert into blatServers values("galGal2", "blat1", "17778", "1"); \
          insert into blatServers values("galGal2", "blat1", "17779", "0");' \
      | hgsql -h genome-testdb hgcentraltest


# MAKE DESCRIPTION/SAMPLE POSITION HTML PAGE (DONE 2/23/04 angie)
    ssh hgwdev
    mkdir /gbdb/galGal2/html
    # Write ~/kent/src/hg/makeDb/trackDb/chicken/galGal2/description.html 
    # with a description of the assembly and some sample position queries.  
    chmod a+r $HOME/kent/src/hg/makeDb/trackDb/chicken/galGal2/description.html
    # Check it in and copy (ideally using "make alpha" in trackDb) to 
    # /gbdb/galGal2/html


# MAKE DOWNLOADABLE SEQUENCE FILES (DONE 2/25/04 angie)
    ssh kksilo
    cd /cluster/data/galGal2
    #- Build the .zip files
    cat << '_EOF_' > jkStuff/zipAll.csh
rm -rf zip
mkdir zip
zip -j zip/chromAgp.zip [0-9A-Z]*/chr*.agp
zip -j zip/chromOut.zip */chr*.fa.out
zip -j zip/chromFa.zip */chr*.fa
zip -j zip/chromFaMasked.zip */chr*.fa.masked
cd bed/simpleRepeat
zip ../../zip/chromTrf.zip trfMaskChrom/chr*.bed
cd ../..
cd /cluster/data/genbank
./bin/i386/gbGetSeqs -db=galGal2 -native GenBank mrna \
        /cluster/data/galGal2/zip/mrna.fa
cd /cluster/data/galGal2/zip
zip -j mrna.zip mrna.fa
'_EOF_'
    # << this line makes emacs coloring happy
    csh ./jkStuff/zipAll.csh |& tee zipAll.log
    cd zip
    #- Look at zipAll.log to make sure all file lists look reasonable.  
    #- Check zip file integrity:
    foreach f (*.zip)
      unzip -t $f > $f.test
      tail -1 $f.test
    end
    wc -l *.zip.test

    #- Copy the .zip files to hgwdev:/usr/local/apache/...
    ssh hgwdev
    cd /cluster/data/galGal2/zip
    set gp = /usr/local/apache/htdocs/goldenPath/galGal2
    mkdir -p $gp/bigZips
    cp -p *.zip $gp/bigZips
    mkdir -p $gp/chromosomes
    foreach f ( ../*/chr*.fa )
      zip -j $gp/chromosomes/$f:t.zip $f
    end

    cd $gp/bigZips
    md5sum *.zip > md5sum.txt
    cd $gp/chromosomes
    md5sum *.zip > md5sum.txt
    # Take a look at bigZips/* and chromosomes/*, update their README.txt's
    # Can't make refGene upstream sequence files - no refSeq for chicken.
    # Maybe ensGene when we get that.


# PUT MASKED SEQUENCE OUT FOR CLUSTER RUNS (DONE 2/26/04 angie)
    ssh kkr1u00
    # Chrom-level mixed nibs that have been repeat- and trf-masked:
    rm -rf /iscratch/i/galGal2/nib
    mkdir /iscratch/i/galGal2/nib
    cp -p /cluster/data/galGal2/nib/chr*.nib /iscratch/i/galGal2/nib
    # Pseudo-contig fa that have been repeat- and trf-masked:
    rm -rf /iscratch/i/galGal2/trfFa
    mkdir /iscratch/i/galGal2/trfFa
    foreach d (/cluster/data/galGal2/*/chr*_?{,?})
      cp $d/$d:t.fa /iscratch/i/galGal2/trfFa
    end
    cp -p /cluster/data/galGal2/galGal2.2bit /iscratch/i/galGal2/
    iSync


# MAKE CHICKEN LINEAGE-SPECIFIC REPEATS (DONE 2/25/04 angie)
    # In an email 2/13/04, Arian said we could treat all human repeats as 
    # lineage-specific, but could exclude these from chicken as ancestral:
    # L3, L3a, L3b, MIR, MIR3, MIRb, MIRm
    ssh kkr1u00
    cd /cluster/data/galGal2
    mkdir -p /iscratch/i/galGal2/linSpecRep
    foreach f (*/chr*.fa.out)
      awk '$10 !~/^(L3|L3a|L3b|MIR|MIR3|MIRb|MIRm)$/ {print;}' $f \
      > /iscratch/i/galGal2/linSpecRep/$f:t:r:r.out.spec
    end


# SWAP BLASTZ HUMAN-CHICKEN TO CHICKEN-HUMAN (HG17) (DONE 8/20/04 angie)
    ssh kolossus
    # hg17-galGal2 alignments were already axtRescored, no need to re-rescore.
    mkdir /cluster/data/galGal2/bed/blastz.hg17.swap.2004-06-14
    ln -s blastz.hg17.swap.2004-06-14 /cluster/data/galGal2/bed/blastz.hg17
    cd /cluster/data/galGal2/bed/blastz.hg17
    set aliDir = /cluster/data/hg17/bed/blastz.galGal2.2004-06-14
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    mkdir unsorted axtChrom
    cat $aliDir/axtChrom/chr*.axt \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSplitByTarget stdin unsorted
    # Sort the shuffled .axt files.
    foreach f (unsorted/*.axt)
      echo sorting $f:t:r
      axtSort $f axtChrom/$f:t
    end
    du -sh $aliDir/axtChrom unsorted axtChrom
#1.4G    /cluster/data/hg17/bed/blastz.galGal2.2004-06-14/axtChrom
#1.4G    unsorted
#1.4G    axtChrom
    rm -r unsorted


# CHAIN HUMAN BLASTZ (DONE 8/20/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/galGal2/bed/blastz.hg17
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/galGal2/bed/blastz.hg17/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
           -linearGap=/cluster/data/blastz/chickenHumanTuned.gap \
           -minScore=5000 $1 \
  /iscratch/i/galGal2/nib \
  /iscratch/i/gs.18/build35/bothMaskedNibs $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 50 of 50 jobs
#Average job time:                  69s       1.15m     0.02h    0.00d
#Longest job:                      223s       3.72m     0.06h    0.00d
#Submission to last job:           229s       3.82m     0.06h    0.00d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/galGal2/bed/blastz.hg17/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/galGal2/bed/blastz.hg17/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain galGal2 ${c}_chainHg17 $i
    end


# NET HUMAN BLASTZ (DONE 8/20/04 angie)
    ssh kksilo
    cd /cluster/data/galGal2/bed/blastz.hg17/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/galGal2/bed/blastz.hg17/axtChain
    netClass noClass.net galGal2 hg17 human.net

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/galGal2/bed/blastz.hg17/axtChain
    rm noClass.net
    # Make a 'syntenic' subset of these with
    netFilter -syn human.net > humanSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/galGal2/bed/blastz.hg17/axtChain
    netFilter -minGap=10 human.net |  hgLoadNet galGal2 netHg17 stdin
    netFilter -minGap=10 humanSyn.net | hgLoadNet galGal2 netSyntenyHg17 stdin
    # Add entries for chainHg17, netHg17, netSyntenyHg17 to chicken/galGal2 trackDb


# GENERATE HG17 MAF FOR MULTIZ FROM NET (DONE 8/20/04 angie)
    ssh kksilo
    cd /cluster/data/galGal2/bed/blastz.hg17/axtChain
    netSplit human.net net
    cd /cluster/data/galGal2/bed/blastz.hg17
    mkdir axtNet
    foreach f (axtChain/net/*)
      set chr = $f:t:r
      netToAxt $f axtChain/chain/$chr.chain /cluster/data/galGal2/nib \
        /cluster/data/hg17/nib stdout \
      | axtSort stdin axtNet/$chr.axt
    end
    mkdir mafNet
    foreach f (axtNet/chr*.axt)
      set maf = mafNet/$f:t:r.maf
      axtToMaf $f \
            /cluster/data/galGal2/chrom.sizes /cluster/data/hg17/chrom.sizes \
            $maf -tPrefix=galGal2. -qPrefix=hg17.
    end

# XENOPUS BLASTZ/CHAIN/NET (DONE 9/24/04 jk)
# see makeXenTro1.doc and search for zb.galGal2
# The results of this are also symlinked under galGal2/bed

# MAKE VSHG17 DOWNLOADABLES (DONE 5/2/05 angie)
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/galGal2/vsHg17
    cd /usr/local/apache/htdocs/goldenPath/galGal2/vsHg17
    ln -s /cluster/data/galGal2/bed/blastz.hg17/axtChain/galGal2.hg17.*.gz .
    mkdir axtNet
    foreach f (/cluster/data/galGal2/bed/blastz.hg17/axtNet/*.axt.gz)
      ln -s $f axtNet/$f:t:r:r.galGal2.hg17.net.axt.gz
    end
    nice md5sum *.gz */*.gz > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.


# BLASTZ SELF (DONE 3/3/04 angie)
    ssh kk
    mkdir -p /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03
    cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03
    # OK, the 500k x whole chrom 03-02 run was taking FOREVER (18hrs, ~63%)
    # and multiple folks are asking me to dig up pileups for Arian.  So, 
    # just do a vanilla run as a baseline; return to the experiments later.
    cat << '_EOF_' > DEF
# chicken vs. chicken
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
BLASTZ_ABRIDGE_REPEATS=0

# TARGET
# Chicken
SEQ1_DIR=/iscratch/i/galGal2/nib
SEQ1_RMSK=
SEQ1_FLAG=
SEQ1_SMSK=
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY
# Chicken
SEQ2_DIR=/iscratch/i/galGal2/nib
SEQ2_RMSK=
SEQ2_FLAG=
SEQ2_SMSK=
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=10000

BASE=/cluster/data/galGal2/bed/blastz.galGal2.2004-03-03

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line makes emacs coloring happy

    # Save the DEF file in the current standard place
    chmod +x DEF
    cp DEF ~angie/hummus/DEF.galGal2-galGal2

    # source the DEF file
    bash
    . ./DEF
    mkdir -p $BASE/run
    ~angie/hummus/make-joblist $DEF > $BASE/run/j
    sh $BASE/xdir.sh
    cd $BASE/run
    # now edit j to prefix path to executable name
    sed -e 's#^#/cluster/bin/penn/#' j > j2
    wc -l j*
    head j2
    # make sure the j2 edits are OK, then use it:
    mv j2 j
    para create j
    para try, check, push, check, ...
#Completed: 22801 of 22801 jobs
#Average job time:                 257s       4.29m     0.07h    0.00d
#Longest job:                     1238s      20.63m     0.34h    0.01d
#Submission to last job:          6939s     115.65m     1.93h    0.08d

    # --- normalize (PennSpeak for lift)
    ssh kki
    cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03
    # run bash shell if not running it already
    source DEF
    mkdir -p $BASE/run.1
    mkdir -p $BASE/lav
    # create a new job list to convert out files to lav
    /cluster/bin/scripts/blastz-make-out2lav $DEF $BASE \
                        > run.1/jobList
    cd run.1
    wc -l jobList
    head jobList
    # make sure the job list is OK
    para create jobList
    para push
#Completed: 151 of 151 jobs
#Average job time:                  24s       0.41m     0.01h    0.00d
#Longest job:                      127s       2.12m     0.04h    0.00d
#Submission to last job:           246s       4.10m     0.07h    0.00d

    # convert lav files to axt
    ssh kki
    cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03
    mkdir axtChrom
    # a new run directory
    mkdir run.2
    cd run.2
# lavToAxt has a new -dropSelf option that *will* replace axtDropOverlap,
# once /cluster/bin/i386/lavToAxt is rebuilt...  Meanwhile it's built 
# for x86_64.
    cat << '_EOF_' > do.csh
#!/bin/csh
cd $1
cat `ls -1 *.lav | sort -g` \
| lavToAxt -dropSelf stdin /iscratch/i/galGal2/nib /iscratch/i/galGal2/nib \
    stdout \
| axtSort stdin $2
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x do.csh
    cat << '_EOF_' > gsub
#LOOP
./do.csh {check in exists $(path1)} {check out line+ /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChrom/$(root1).axt}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    \ls -1Sd ../lav/chr* > chrom.list
    gensub2 chrom.list single gsub jobList
    wc -l jobList
    head jobList
    para create jobList
    para try, check, push, check,...
#Completed: 53 of 54 jobs
#Crashed: 1 jobs
#Average job time:                  30s       0.51m     0.01h    0.00d
#Longest job:                      541s       9.02m     0.15h    0.01d
#Submission to last job:           576s       9.60m     0.16h    0.01d
    # The crash was not a memory problem -- it was that chr8_random's lav 
    # contained only 1 alignment, to itself (dropped), so its .axt is empty.  


# CHAIN SELF BLASTZ (DONE 3/3/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
axtFilter -notQ=chrUn $1 \
| axtChain stdin /iscratch/i/galGal2/nib /iscratch/i/galGal2/nib $2 \
  > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
    # oops -- empty chr8_random strikes again:
#Completed: 53 of 54 jobs
#Crashed: 1 jobs
#Average job time:                  53s       0.89m     0.01h    0.00d
#Longest job:                      514s       8.57m     0.14h    0.01d
#Submission to last job:           514s       8.57m     0.14h    0.01d

    # now on the cluster server, sort chains
    ssh kolossus
    cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # trim to minScore=20000 to cut some of the fluff
    mkdir chainFilt
    foreach f (chain/*.chain)
      chainFilter -minScore=20000 $f > chainFilt/$f:t
    end

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChain/chainFilt
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain galGal2 ${c}_chainSelf $i
    end


# NET SELF BLASTZ (DONE 3/3/04 angie)
    ssh kolossus
    cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChain
    netClass -noAr noClass.net galGal2 galGal2 self.net

    # Make a 'syntenic' subset:
    ssh kolossus
    cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChain
    rm noClass.net
    netFilter -syn self.net > selfSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChain
    netFilter -minGap=10 self.net |  hgLoadNet galGal2 netSelf stdin
    netFilter -minGap=10 selfSyn.net | hgLoadNet galGal2 netSyntenySelf stdin
    # Add entries for chainSelf, netSelf, netSyntenySelf to 
    # chicken/galGal2 trackDb


# MAKE VSSELF DOWNLOADABLES (DONE 3/3/04 angie)
    ssh kolossus
    cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03
    # Webb asked for axtChrom/chr22.axt... since axtChrom is rel. small 
    # this time, just put it all out there.
    zip /cluster/data/galGal2/zip/GGaxtChrom.zip axtChrom/chr*.axt
    cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03/axtChain
    cp all.chain self.chain
    zip /cluster/data/galGal2/zip/self.chain.zip self.chain
    rm self.chain
    zip /cluster/data/galGal2/zip/self.net.zip self.net
    zip /cluster/data/galGal2/zip/selfSyn.net.zip selfSyn.net

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/galGal2/vsSelf
    cd /usr/local/apache/htdocs/goldenPath/galGal2/vsSelf
    mv /cluster/data/galGal2/zip/GGaxtChrom.zip axtChrom.zip
    mv /cluster/data/galGal2/zip/self*.zip .
    md5sum *.zip > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.


# EXTRACT PILEUPS FROM SELF ALIGNMENTS (DONE 3/9/04 angie)
    ssh kolossus
    cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03
    mkdir pslChrom
    foreach f (axtChrom/chr*.axt)
      echo $f:t:r
      axtToPsl $f S1.len S2.len pslChrom/$f:t:r.psl
    end
    mkdir pileups
    foreach f (pslChrom/chr*.psl)
      echo extracting pileups from $f:t:r
      pslToPileup $f S1.len pileups/$f:t:r.bed
    end
    ssh hgwdev
    cd /cluster/data/galGal2/bed/blastz.galGal2.2004-03-03
    hgLoadBed -noBin -sqlTable=$HOME/kent/src/hg/lib/pileups.sql galGal2 \
      pileups pileups/chr*.bed
    # Used table browser to intersect pileups w/>=50% overlap with rmsk,
    # save BED to pileups50PctRmsk.bed .


# AUTO UPDATE GENBANK MRNA RUN  (DONE 2/26/04 angie)
    # Update genbank config and source in CVS:
    cd ~/kent/src/hg/makeDb/genbank
    cvsup .
    # See if /cluster/data/genbank/etc/genbank.conf has had any un-checked-in
    # edits, check them in if necessary:
    diff /cluster/data/genbank/etc/genbank.conf etc/genbank.conf

    # Edit etc/genbank.conf and add these lines:
# galGal2 (chicken)
galGal2.genome = /iscratch/i/galGal2/nib/chr*.nib
galGal2.lift = /cluster/data/galGal2/jkStuff/liftAll.lft
galGal2.refseq.mrna.native.load = no
galGal2.genbank.mrna.xeno.load = yes
galGal2.genbank.est.xeno.load = no
galGal2.downloadDir = galGal2

    cvs ci etc/genbank.conf
    # Edit src/align/gbBlat to add /iscratch/i/galGal2/11.ooc
    cvs diff src/align/gbBlat
    make
    cvs ci src/align/gbBlat
    # Install to /cluster/data/genbank:
    make install-server

    ssh eieio
    cd /cluster/data/genbank
    # This is an -initial run, mRNA only:
    nice bin/gbAlignStep -clusterRootDir=/cluster/bluearc/genbank \
      -iserver=kkr1u00 -iserver=kkr2u00 -iserver=kkr3u00 -iserver=kkr4u00 \
      -iserver=kkr5u00 -iserver=kkr6u00 -iserver=kkr7u00 -iserver=kkr8u00 \
      -srcDb=genbank -type=mrna -verbose=1 -initial galGal2
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 -drop -initialLoad galGal2
    # Clean up:
    rm -r /cluster/bluearc/genbank/work/initial.galGal2

    ssh eieio
    # -initial for ESTs (now with /cluster/store7 and iservers):
    nice bin/gbAlignStep -clusterRootDir=/cluster/store7/genbank \
      -iserver=kkr1u00 -iserver=kkr2u00 -iserver=kkr3u00 -iserver=kkr4u00 \
      -iserver=kkr5u00 -iserver=kkr6u00 -iserver=kkr7u00 -iserver=kkr8u00 \
      -srcDb=genbank -type=est -verbose=1 -initial galGal2
    # Load results:
    ssh hgwdev
    cd /cluster/data/genbank
    nice bin/gbDbLoadStep -verbose=1 galGal2
    # Clean up:
    rm -r /cluster/store7/genbank/work/initial.galGal2


# PRODUCING GENSCAN PREDICTIONS (DONE 2/25/04 angie)
    ssh hgwdev
    mkdir /cluster/data/galGal2/bed/genscan
    cd /cluster/data/galGal2/bed/genscan
    # Check out hg3rdParty/genscanlinux to get latest genscan:
    cvs co hg3rdParty/genscanlinux
    # Run on small cluster (more mem than big cluster).
    ssh kki
    cd /cluster/data/galGal2/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    rm -f genome.list
    touch genome.list
    foreach f ( `ls -1S /cluster/data/galGal2/*/chr*_*/chr*_?{,?}.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> genome.list
    end
    wc -l genome.list
    # Create template file, gsub, for gensub2.  For example (3-line file):
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 genome.list single gsub jobList
    para create jobList
    para try, check, push, check, ...
#Completed: 254 of 259 jobs
#Crashed: 5 jobs
#Average job time:                 438s       7.29m     0.12h    0.01d
#Longest job:                    21229s     353.82m     5.90h    0.25d
#Submission to last job:         21229s     353.82m     5.90h    0.25d
    # If there are crashes, diagnose with "para problems".  
    # If a job crashes due to genscan running out of memory, re-run it 
    # manually with "-window=1200000" instead of "-window=2400000".
    /cluster/bin/i386/gsBig /cluster/data/galGal2/1/chr1_4/chr1_4.fa.masked gtf/chr1_4.fa.gtf -trans=pep/chr1_4.fa.pep -subopt=subopt/chr1_4.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
    /cluster/bin/i386/gsBig /cluster/data/galGal2/7/chr7_5/chr7_5.fa.masked gtf/chr7_5.fa.gtf -trans=pep/chr7_5.fa.pep -subopt=subopt/chr7_5.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
    /cluster/bin/i386/gsBig /cluster/data/galGal2/11/chr11_2/chr11_2.fa.masked gtf/chr11_2.fa.gtf -trans=pep/chr11_2.fa.pep -subopt=subopt/chr11_2.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
    /cluster/bin/i386/gsBig /cluster/data/galGal2/1/chr1_36/chr1_36.fa.masked gtf/chr1_36.fa.gtf -trans=pep/chr1_36.fa.pep -subopt=subopt/chr1_36.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000
    /cluster/bin/i386/gsBig /cluster/data/galGal2/Z/chrZ_6/chrZ_6.fa.masked gtf/chrZ_6.fa.gtf -trans=pep/chrZ_6.fa.pep -subopt=subopt/chrZ_6.fa.bed -exe=hg3rdParty/genscanlinux/genscan -par=hg3rdParty/genscanlinux/HumanIso.smat -tmp=/tmp -window=1200000

    # Convert these to chromosome level files as so:
    ssh kksilo
    cd /cluster/data/galGal2/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd /cluster/data/galGal2/bed/genscan
    ldHgGene galGal2 genscan genscan.gtf
    hgPepPred galGal2 generic genscanPep genscan.pep
    hgLoadBed galGal2 genscanSubopt genscanSubopt.bed


# PRODUCING FUGU BLAT ALIGNMENTS (DONE 2/27/04)
    ssh kk
    mkdir /cluster/data/galGal2/bed/blatFr1
    cd /cluster/data/galGal2/bed/blatFr1
    ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
    ls -1S /iscratch/i/galGal2/nib/*.nib > chicken.lst
    cat << '_EOF_' > gsub
#LOOP
blat -mask=lower -q=dnax -t=dnax {check in exists $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    mkdir psl
    gensub2 chicken.lst fugu.lst gsub spec
    para create spec
    para try, check, push, check, ...
    # for future reference, the trfFa on bluearc seem better partitioned...
#Completed: 31212 of 31212 jobs
#Average job time:                 187s       3.11m     0.05h    0.00d
#Longest job:                    31706s     528.43m     8.81h    0.37d
#Submission to last job:         31706s     528.43m     8.81h    0.37d

    # Sort alignments:
    ssh kksilo
    cd /cluster/data/galGal2/bed/blatFr1
    pslCat -dir psl | pslSortAcc nohead chrom /cluster/store2/temp stdin

    # load seq & psl into database:
    ssh hgwdev
    mkdir /gbdb/galGal2/fuguSeq
    ln -s /cluster/data/fr1/fugu_v3.masked.fa /gbdb/galGal2/fuguSeq/
    cd /cluster/data/galGal2/bed/blatFr1
    hgLoadSeq galGal2 /gbdb/galGal2/fuguSeq/fugu_v3.masked.fa
    cd /cluster/data/galGal2/bed/blatFr1/chrom
    cat *.psl | hgLoadPsl -fastLoad -table=blatFr1 galGal2 stdin


# LOAD CPGISLANDS (DONE 3/23/04 angie)
    ssh hgwdev
    mkdir -p /cluster/data/galGal2/bed/cpgIsland
    cd /cluster/data/galGal2/bed/cpgIsland
    # Build software from Asif Chinwalla (achinwal@watson.wustl.edu)
    cvs co hg3rdParty/cpgIslands
    cd hg3rdParty/cpgIslands
    make
    mv cpglh.exe /cluster/data/galGal2/bed/cpgIsland/
    
    ssh kksilo
    cd /cluster/data/galGal2/bed/cpgIsland
    foreach f (../../*/chr*.fa.masked)
      set fout=$f:t:r:r.cpg
      echo running cpglh on $f to $fout
      ./cpglh.exe $f > $fout
    end
    # Transform cpglh output to bed +
    cat << '_EOF_' > filter.awk
/* Input columns: */
/* chrom, start, end, len, CpG: cpgNum, perGc, cpg:gpc, observed:expected */
/* chr1\t 41776\t 42129\t 259\t CpG: 34\t 65.8\t 0.92\t 0.94 */
/* Output columns: */
/* chrom, start, end, name, length, cpgNum, gcNum, perCpg, perGc, obsExp */
/* chr1\t41775\t42129\tCpG: 34\t354\t34\t233\t19.2\t65.8\t0.94 */
{
$2 = $2 - 1;
width = $3 - $2;
printf("%s\t%d\t%s\t%s %s\t%s\t%s\t%0.0f\t%0.1f\t%s\t%s\n",
       $1, $2, $3, $5,$6, width,
       $6, width*$7*0.01, 100.0*2*$6/width, $7, $9);
}
'_EOF_'
    # << this line makes emacs coloring happy
    awk -f filter.awk chr*.cpg > cpgIsland.bed

    # load into database:
    ssh hgwdev
    cd /cluster/data/galGal2/bed/cpgIsland
    hgLoadBed galGal2 cpgIslandExt -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIslandExt.sql cpgIsland.bed


# LOAD GENEID GENES (DONE 3/4/04 angie)
    # reloaded 3/16/04 with -gtf instead of -exon=CDS (nec. now! for stop_codon)
    mkdir -p /cluster/data/galGal2/bed/geneid/download
    cd /cluster/data/galGal2/bed/geneid/download
    foreach f (/cluster/data/galGal2/*/chr*.fa)
      set chr = $f:t:r
      wget \
http://genome.imim.es/genepredictions/G.gallus/golden_path_200402/geneid_v1.1/$chr.gtf
      wget \
http://genome.imim.es/genepredictions/G.gallus/golden_path_200402/geneid_v1.1/$chr.prot
    end
    # Add missing .1 to protein id's
    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
    end
    cd ..
    ldHgGene -gtf galGal2 geneid download/*.gtf
    hgPepPred galGal2 generic geneidPep download/*-fixed.prot

# QA NOTE [ASZ: 9-11-2006]: mytouch for geneidPep
# sudo mytouch galGal2 geneidPep 200403162100.00


# BUILD BLAST DATABASES
    cd /cluster/data/galGal2
    mkdir blastDb
     for i in `cat chrom.lst`; do for j in `echo $i/chr${i}_*/chr*_*_*.fa`; do ln -s `pwd`/$j blastDb/`basename $j .fa`;
      done; done 
      cd blastDb
      for i in *; do formatdb -p F -i $i; done
# END BLAST


# TIGR GENE INDEX (TODO)
    mkdir -p /cluster/data/galGal2/bed/tigr
    cd /cluster/data/galGal2/bed/tigr
    wget ftp://ftp.tigr.org//pub/data/tgi/TGI_track_ChickenGenome_Jun2003.tgz
    tar xvfz TGI_track_ChickenGenome_Jun2003.tgz
    foreach f (*cattle*)
      set f1 = `echo $f | sed -e 's/cattle/cow/g'`
      mv $f $f1
    end
    foreach o (rat cow human pig chicken)
      setenv O $o
      foreach f (chr*_$o*s)
        tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
      end
    end
    ldHgGene -exon=TC galGal2 tigrGeneIndex *.gff
       # 139456 gene predictions
    gzip *TCs *.gff


# GC 5 BASE WIGGLE TRACK (DONE 2/24/04 angie)
    ssh kksilo
    mkdir /cluster/data/galGal2/bed/gc5Base
    cd /cluster/data/galGal2/bed/gc5Base
    mkdir wigData5 dataLimits5
    foreach n (../../nib/*.nib)
      set c=`basename ${n} | sed -e "s/.nib//"`
      set C=`echo $c | sed -e "s/chr//"`
      echo "working on ${c} - ${C} ... "
      hgGcPercent -chr=${c} -doGaps -file=stdout -win=5 galGal2 ../../nib \
      | grep -w GC \
      | awk '{ \
               bases = $3 - $2 \
               perCent = $5/10.0 \
               printf "%d\t%.1f\n", $2+1, perCent \
             }' \
      | wigAsciiToBinary \
        -dataSpan=5 -chrom=${c} -wibFile=wigData5/gc5Base_${C} \
        -name=${C} stdin > dataLimits5/${c}
    end
    # data is complete, load track on hgwdev
    ssh hgwdev
    cd /cluster/data/galGal2/bed/gc5Base
    # create symlinks for .wib files, then load track
    mkdir /gbdb/galGal2/wib
    ln -s `pwd`/wigData5/*.wib /gbdb/galGal2/wib
    hgLoadWiggle galGal2 gc5Base wigData5/*.wig


# SUPERCONTIG LOCATIONS TRACK (DONE 2/26/04 angie)
    # Supercontig N is a collection of contained "ContigN.*".  
    # Extract Supercontig starts and ends from AGP into bed.  
    mkdir /cluster/data/galGal2/bed/supercontig
    cd /cluster/data/galGal2/bed/supercontig
    cat ../../agp_040224/chr*.agp \
    | perl -we 'while (<>) { \
                  chop; @words = split("\t"); \
                  next if ($words[4] eq "N"); \
                  if ($words[5] =~ /^(Contig\d+)\.(\d+)$/) { \
                    $sup = $1;  $newNum = $2; \
                    $firstNum = $newNum if (! defined $firstNum); \
                    if (defined $lastSup && $lastSup ne $sup) { \
                      $start--; \
                      print "$chr\t$start\t$end\t$lastSup ($firstNum-$lastNum)\t1000\t$strand\n"; \
                      undef $start;  undef $end;  $firstNum = $newNum; \
                    } \
                    $chr = $words[0]; $cStart = $words[1]; $cEnd = $words[2]; \
                    $strand = $words[8]; \
                    $start = $cStart if (\!defined $start || $start > $cStart); \
                    $end = $cEnd if (\!defined $end || $end < $cEnd); \
                    $lastSup = $sup;  $lastNum = $newNum; \
                  } else { die "bad 6th word $words[5]\n"; } \
                } \
                print "$chr\t$start\t$end\t$lastSup ($firstNum-$lastNum)\t1000\t$strand\n";' \
    > supercontig.bed
    ssh hgwdev
    cd /cluster/data/galGal2/bed/supercontig
    hgLoadBed -tab galGal2 supercontig supercontig.bed


# CREATING QUALITY SCORE TRACK (DONE 2/24/04 angie)
    ssh kksilo
    mkdir /cluster/data/galGal2/bed/qual
    cd /cluster/data/galGal2/bed/qual
    # Build a whole-assembly AGP, then convert per-contig quality scores
    # into one compressed per-chromosome quality file (chrom.qac):
    cat ../../agp_040224/chr*.agp > assembly.agp
    tar xvzf ../../chicken_040105.contigs.qual.tar.gz
    cat chicken*.qual | qaToQac stdin stdout \
    | chimpChromQuals assembly.agp stdin chrom.qac
    rm chicken*.qual
    # Per-chromosome wiggle conversion: one .wib per chrom, with the
    # data-limits output captured per chrom.
    mkdir wigData dataLimits
    foreach c (`cat ../../chrom.lst`)
      foreach agp (../../$c/chr$c.agp ../../$c/chr${c}_random.agp)
        if (-e $agp) then
          set chr = $agp:t:r
          # abbrev: "chrN" -> "N", "chrN_random" -> "Nr" (shorter filenames)
          set abbrev = `echo $chr | sed -e 's/^chr//;  s/_random/r/;'`
          echo $chr to $abbrev wiggle
          qacToWig chrom.qac -name=$chr stdout \
          | wigAsciiToBinary -dataSpan=1 -chrom=$chr \
              -wibFile=wigData/qual_$abbrev -name=$abbrev stdin \
              > dataLimits/$chr
        endif
      end
    end
    # Verify size of .wib file = chrom length
    foreach f (wigData/*.wib)
      set abbrev = $f:t:r
      # invert the abbreviation applied above to recover the chrom name
      set chr = `echo $abbrev | sed -e 's/^qual_/chr/;  s/r$/_random/;'`
      set wibLen = `ls -l $f | awk '{print $5;}'`
      set chromLen = `grep -w $chr ../../chrom.sizes | awk '{print $2;}'`
      if ($wibLen != $chromLen) then
        echo "ERROR: $chr size is $chromLen but wib size is $wibLen"
      else
        echo $chr OK.
      endif
    end
#ERROR: chr12 size is 19821895 but wib size is 19811895
#ERROR: chr23 size is 5666127 but wib size is 5166127
#ERROR: chr28 size is 4731479 but wib size is 4231479
    # That's OK -- they end in gaps of the right sizes (not incl'd in wib)

    # /gbdb & load:
    ssh hgwdev
    cd /cluster/data/galGal2/bed/qual
    mkdir -p /gbdb/galGal2/wib
    ln -s `pwd`/wigData/*.wib /gbdb/galGal2/wib
    hgLoadWiggle galGal2 quality wigData/*.wig

    # To speed up display for whole chrom views, need to make zoomed
    # data views.  Zoom to 1K points per pixel.
    ssh kksilo
    # FIX(review): the original log cd'd to .../bed/qual/wigData1K, but that
    # directory is only created by the mkdir below, and chrom.qac plus the
    # ../../chrom.lst & ../../$c/... paths used here resolve correctly only
    # from bed/qual (as in the span-1 run above).  Corrected to bed/qual:
    cd /cluster/data/galGal2/bed/qual
    mkdir -p wigData1K
    mkdir -p dataLimits1K
    foreach c (`cat ../../chrom.lst`)
      if (-f ../../${c}/chr${c}.agp) then
        echo "chr${c} quality 1K zoom"
        # qacToWig ascii -> wigZoom (collapse to 1024-base bins) -> .wib
        qacToWig chrom.qac -name=chr${c} stdout \
        | wigZoom stdin \
        | wigAsciiToBinary -dataSpan=1024 \
                -chrom=chr${c} -wibFile=wigData1K/qc1K_${c} \
                -name=${c} stdin > dataLimits1K/chr${c}
      endif
      if (-f ../../${c}/chr${c}_random.agp) then
        echo "chr${c}_random quality 1K zoom"
        qacToWig chrom.qac -name=chr${c}_random stdout \
        | wigZoom stdin \
        | wigAsciiToBinary -dataSpan=1024 \
                -chrom=chr${c}_random -wibFile=wigData1K/qc1K_${c}r \
                -name=${c}_random stdin > dataLimits1K/chr${c}r
      endif
    end

    ssh hgwdev
    cd /cluster/data/galGal2/bed/qual/wigData1K
    # create symlinks for .wib files
    ln -s `pwd`/*.wib /gbdb/galGal2/wib
    # load in addition to existing data (-oldTable appends to the existing
    # quality table rather than dropping/recreating it)
    hgLoadWiggle -oldTable galGal2 quality *.wig


# MAKE UNIGENE ALIGNMENTS (DONE 2/29/04 angie)
    ssh kksilo
    mkdir /cluster/data/galGal2/bed/unigene
    cd /cluster/data/galGal2/bed/unigene
    # Download chicken Unigene and run Chuck's preproc scripts:
    wget ftp://ftp.ncbi.nih.gov/repository/UniGene/Gga.info
    wget ftp://ftp.ncbi.nih.gov/repository/UniGene/Gga.seq.uniq.gz
    wget ftp://ftp.ncbi.nih.gov/repository/UniGene/Gga.data.gz
    gunzip *.gz
    # Chuck's script expects human (Hs) -- use Gga for chicken:
    sed -e 's/Hs/Gga/g' /projects/cc/hg/sugnet/uniGene/countSeqsInCluster.pl \
    > ./countSeqsInCluster.pl
    chmod a+x ./countSeqsInCluster.pl
    ./countSeqsInCluster.pl Gga.data counts.tab
    # Strip fasta headers down to simple names for blat:
    /projects/cc/hg/sugnet/uniGene/parseUnigene.pl Gga.seq.uniq \
      Gga.seq.uniq.simpleHeader.fa leftoverData.tab
    # Put on iscratch for cluster run
    ssh kkr1u00
    mkdir /iscratch/i/galGal2/unigene
    cp -p /cluster/data/galGal2/bed/unigene/Gga.seq.uniq.simpleHeader.fa \
      /iscratch/i/galGal2/unigene/
    iSync
    # align on big cluster
    ssh kk
    cd /cluster/data/galGal2/bed/unigene
    ls -1S /iscratch/i/galGal2/trfFa/*.fa > allctg.lst
    ls -1S /iscratch/i/galGal2/unigene/*.fa > uniGene.lst
# seems to be a bug in latest blat: -mask=lower => 0 alignments!  
# for now, use Jim's next-to-latest blat.27:
    cat << '_EOF_' > template.sub
#LOOP
/cluster/home/kent/bin/i386/blat.27 -mask=lower -minIdentity=95 -ooc=/iscratch/i/galGal2/11.ooc {check in line+ $(path1)} {check in line+ $(path2)}  {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy
    gensub2 allctg.lst uniGene.lst template.sub jobList
    para create jobList
    mkdir psl
    para try, check, push, check
#Completed: 259 of 259 jobs
#Average job time:                  13s       0.22m     0.00h    0.00d
#Longest job:                       21s       0.35m     0.01h    0.00d
#Submission to last job:            22s       0.37m     0.01h    0.00d
    # postprocess
    ssh kksilo
    # NOTE(review): no cd after this ssh -- presumably still run from
    # /cluster/data/galGal2/bed/unigene (NFS-shared path); confirm.
    pslSort dirs raw.psl tmp psl >& pslSort.log
    # Lift contig coords to chrom coords, then keep near-best alignments:
    liftUp -type=.psl stdout ../../jkStuff/liftAll.lft warn raw.psl \
    | pslReps -minCover=0.2 -minAli=0.965 -nearTop=0.002 \
      stdin uniGene.lifted.pslReps.psl /dev/null
    rm raw.psl
    gzip Gga.seq.uniq Gga.data 
    # load into db
    ssh hgwdev
    cd /cluster/data/galGal2/bed/unigene
    mkdir /gbdb/galGal2/unigene
    ln -s /cluster/data/galGal2/bed/unigene/Gga.seq.uniq.simpleHeader.fa \
      /gbdb/galGal2/unigene/
    hgLoadSeq galGal2 /gbdb/galGal2/unigene/Gga.seq.uniq.simpleHeader.fa
    hgLoadPsl -fastLoad -table=uniGene_gg galGal2 uniGene.lifted.pslReps.psl


# LIFTOVER ENCODE REGIONS FOR JOHN WALLIS (DONE 3/2/04 angie)
    # It's too early to show these in the browser, but try to liftOver:
    ssh hgwdev
    mkdir /cluster/data/galGal2/bed/encodeLiftOver
    cd /cluster/data/galGal2/bed/encodeLiftOver
    echo 'select * from hg16.encodeRegions' | hgsql hg16 -N > hg16.eR.bed
    # Of 44 regions:
    # -minMatch=.95 (default) -> nothing mapped
    # -minMatch=.1 -> 2 mapped
    # -minMatch=.01 -> 33 mapped
    # -minMatch=.001 -> 14 mapped -- huh??? -> "Duplicated in new."
    # -minMatch=.011 -> 34 mapped -- best I could get.
    liftOver -minMatch=.011 hg16.eR.bed \
      /cluster/data/hg16/bed/bedOver/hg16TogalGal2.chain \
      galGal2.encodeRegions.bed galGal2.encodeRegions.unmapped
    # emailed the files to John Wallis w/warning they're prelim, automated.
    # This is experimental, so no track is put up yet, (2004-07-9 kate)
    cd /cluster/data/galGal2/bed
    # Alias the directory & file under the conventional track names:
    ln -s encodeLiftOver encodeRegions
    cd encodeRegions
    ln -s galGal2.encodeRegions.bed encodeRegions.bed
    hgLoadBed galGal2 encodeRegions encodeRegions.bed -noBin


# SNP & GENES FROM BEIJING GENOME INST. (DONE 7/20/04 angie)
    ssh kksilo
    mkdir /cluster/data/galGal2/bed/bgi
    cd /cluster/data/galGal2/bed/bgi
    # Their files are very fragmented into a deep dir structure.  
    # Pull the whole thing (except chromatograms) over intact.  
    wget -r --exclude-directories=\*/chromatograms \
      --timestamping ftp://genome.wustl.edu/private/BGI/
    # Takes hours...
    # Zhang Yong said to ignore the target regions, so move the dir 
    # out of the way for now...
    mv genome.wustl.edu/private/BGI/SNPs_by_locus/ .

    # Coverage track: bed 4 showing where SNP reads were performed.
    # Looks like we can get strain info from filename... 
    find genome.wustl.edu/private/BGI/SNPs_by_strain/2004-07-07 \
       -name \*ContigCov-\*.txt\* -print \
    | grep -v /BAC-endpairs/ \
    | xargs $HOME/kent/src/hg/snp/bgi/bgiCovToBed.pl \
      > bgiCov.bed

    # Gene annotations:
    # (sed escapes spaces in paths so xargs doesn't split them)
    find genome.wustl.edu/private/BGI/external_data/2004-05-11 \
      -name \*exon_intron.txt -print \
    | sed -e 's/ /\\ /g' \
    | xargs $HOME/kent/src/hg/snp/bgi/bgiExonIntronToGenePred.pl \
      > bgiGene.tab

    # Gene annotation peptide seqs:
    find genome.wustl.edu/private/BGI/external_data/2004-05-11 \
      -name \*.pep\* -print \
    | sed -e 's/ /\\ /g' \
    | xargs $HOME/kent/src/hg/snp/bgi/bgiPep.pl \
      > bgiGenePep.fa

    # Gene info and SNPs: slurp in all of the GeneSNPcon* files and 
    # SNPlist-* files, cross-reference, and dump out bgiGeneInfo.sql 
    # and bgiSnp.bed.  (bgiGeneInfo.sql with INSERT statements because of 
    # the longblobs which can't be loaded in from .tab unfortunately)
    find genome.wustl.edu/private/BGI/SNPs_by_strain/2004-07-07 \
      -name SNPlist-\*.txt\* -print \
      | grep -v /BAC-endpairs/ \
      > SNPlists.list
    find genome.wustl.edu/private/BGI/SNPs_by_cDNA/2004-07-07 \
      -name \*GeneSNPcon\*.txt\* -print \
      > GeneSNPcons.list
    # Takes about 15 minutes:
    time $HOME/kent/src/hg/snp/bgi/bgiXref.pl SNPlists.list GeneSNPcons.list
    # Tons of duplicated gene-snp association warnings like this:
#dup assoc ENS.ENSGALT00000000020        snp.117.58.6795.S.2:
#  ENS.ENSGALT00000000020        snp.117.58.6795.S.2     Intron
#  ENS.ENSGALT00000000020        snp.117.58.6795.S.2     Splice Site     AG->AA
    # which I think is OK.  Also duplicated gene sources (asked Zhang Yong):
#dup:    HDG.ENSG00000005381     HumanDiseaseGene        HumanDiseaseGene
    # Load 'em up...
    ssh hgwdev
    cd /cluster/data/galGal2/bed/bgi
    hgLoadBed galGal2 bgiCov bgiCov.bed
    # create the table schema, then run the generated INSERTs:
    hgsql galGal2 < $HOME/kent/src/hg/lib/bgiGeneInfo.sql
    hgsql galGal2 < ./bgiGeneInfo.sql
    hgsql galGal2 < $HOME/kent/src/hg/lib/bgiGeneSnp.sql
    echo "load data local infile './bgiGeneSnp.tab' into table bgiGeneSnp" \
    | hgsql galGal2
    hgPepPred galGal2 generic bgiGenePep bgiGenePep.fa
    ldHgGene galGal2 bgiGene -predTab bgiGene.tab
    hgLoadBed galGal2 bgiSnp -tab -sqlTable=$HOME/kent/src/hg/lib/bgiSnp.sql \
      bgiSnp.bed


# ENSEMBL GENE BUILD (DONE 4/27/04 angie)
    ssh kksilo
    mkdir /cluster/data/galGal2/bed/ensembl
    cd /cluster/data/galGal2/bed/ensembl
    wget ftp://ftp.sanger.ac.uk/pub/searle/chicken/chicken_build.pep.gz
    wget ftp://ftp.sanger.ac.uk/pub/searle/chicken/chicken_build.gtf.gz
    gunzip *.gz
    # GTF seq names lack the "chr" prefix used by our chrom names:
    sed -e 's/^/chr/' chicken_build.gtf \
      > chicken_build.fixed.gtf
    # Reduce peptide fasta headers to just the transcript id:
    perl -wpe 's/^>.*trans_id=(\w+).*/>$1/' \
      chicken_build.pep > chicken_build.fixed.pep
    ssh hgwdev
    cd /cluster/data/galGal2/bed/ensembl
    hgPepPred galGal2 generic ensPep chicken_build.fixed.pep
    ldHgGene -gtf -frame -id -geneName galGal2 ensGene chicken_build.fixed.gtf


# MAKE KNOWN REGULATORY REGIONS (DONE 3/29/04 angie)
    # Adam got several text files from Webb (and/or Laura Elnitski?), 
    # translated them to bed 3, and loaded.  I tweaked -> bed 4.  
    # (this does both hg16 and galGal2)
    ssh hgwdev
    cd /cluster/home/acs/webb-launcher
    # "munge" preps the data, "track" loads it (see Adam's Makefile.angie):
    make -f Makefile.angie munge
    make -f Makefile.angie track


# RUN GENE-CHECK ON MRNA W/CDS, ENSEMBL (DONE 3/29/04 markd)
    # Note: /cluster/bin/scripts/runGeneCheck should make this easier 
    # in the future, but this also shows how MarkD ran gene-check on 
    # mRNAs with annotated CDS:

    # get mrna data from hgwbeta, which is most current:
    ssh hgwbeta
    mkdir /cluster/store7/markd/galGal2
    cd /cluster/store7/markd/galGal2
    mkdir data
    # cut -f 2-30 drops the leading bin column from the psl table:
    hgsql -N -e 'select * from all_mrna' galGal2 |cut -f 2-30 \
      > data/all_mrna.psl
    hgsql -N -e 'select mrna.acc,cds.name from mrna,cds,all_mrna where all_mrna.qName=mrna.acc and mrna.cds = cds.id' galGal2 \
      > data/cds.tab
    nice mrnaToGene -cdsFile=data/cds.tab data/all_mrna.psl data/all_mrna.gp \
      >& data/all_mrna.log

    # get ensGene data from hgwdev:
    ssh hgwdev
    cd /cluster/store7/markd/galGal2
    hgsql -N -e 'select * from ensGene ' galGal2 >data/ensGene.gp

    # run gene-check and gene-extract
    ssh kksilo
    cd /cluster/store7/markd/galGal2
    mkdir results
    ~markd/bin/gene-check -incl-ok -nib-dir /cluster/data/galGal2/nib \
      -details-out results/all_mrna.details data/all_mrna.gp \
      results/all_mrna.chk >& all_mrna.log
    ~markd/bin/gene-check -incl-ok -nib-dir /cluster/data/galGal2/nib \
      -details-out results/ensGene.details data/ensGene.gp \
      results/ensGene.chk >& ensGene.log
    # NOTE(review): the gene-extract runs below reuse all_mrna.log /
    # ensGene.log and so overwrite the gene-check logs written just above.
    ~markd/bin/gene-extract -nib-dir  /cluster/data/galGal2/nib \
      -all-cds-fa results/all_mrna.cds.fa  data/all_mrna.gp >& all_mrna.log
    ~markd/bin/gene-extract -nib-dir  /cluster/data/galGal2/nib \
      -all-cds-fa results/ensGene.cds.fa  data/ensGene.gp >&ensGene.log
    # Files in results/
    #     *.chk - output of the gene-check program, rdb format
    #     *.details - details of problems detected 
    #     *.cds.fa - fasta of CDS taken from genome
    # Columns in *.chk file are
    #     acc - name field from genePred
    #     chr  - genome location from genePred
    #     chrStart
    #     chrEnd
    #     strand
    #     stat - overall status based on other data: ok or err
    #     frame - is the frame annotation sane (mult of 3): ok, bad, or noCDS
    #             if noCDS, other CDS checks are not done.
    #     start - Is there a valid start codon: ok or no
    #     stop - Is there a valid stop codon: ok or no
    #     orfStop - Number of in-frame stop codons
    #     smallGap - Number of small gaps, too small to be introns
    #     unknownSplice - Number of introns with unknown splicing patterns 
    #          (considers the three most common splicing patterns as valid)
    #     causes - list of symbolic names summarizing the failures 


# WEBB'S PUTATIVE NON-EXONIC CONSERVED REGIONS (REDONE 6/26/04 angie)
    ssh hgwdev
    mkdir /cluster/data/galGal2/bed/webbNonExonic
    cd /cluster/data/galGal2/bed/webbNonExonic
    # saved email attachment from Webb Miller (5/26/04)
    # Input is 1-based chrom/start/end; shift start to 0-based bed and
    # synthesize a unique name per item (chrom.serial):
    awk 'BEGIN {i=1;} {print $1 "\t" $2-1 "\t" $3 "\t" $1 "." i; i++;}' \
      chickenspans2.txt > webbNonExonic.bed
    hgLoadBed galGal2 webbNonExonic webbNonExonic.bed


# UN-ANNOTATED (EXCEPT FOR CROSS-SPECIES) REGIONS (DONE 4/27/04 angie)
    # Anton Nekrutenko asked the chickenanalysis list for this... sounds 
    # like a job for featureBits.  :)
    ssh hgwdev
    mkdir /cluster/data/galGal2/bed/unAnnotated
    cd /cluster/data/galGal2/bed/unAnnotated
    # Intersect the complements (\!table) of all annotation tracks;
    # -minSize drops fragments under 12 bases:
    nice featureBits galGal2 -minSize=12 \
      \!gap \!bgiGene \!ensGene \!geneid \!genscan \!twinscan \
      \!mrna \!intronEst \!est \!xenoMrna \
      \!uniGene_gg \!cpgIslandExt \!rmsk \!simpleRepeat \
      -bed=unAnnotated.bed
#826274476 bases of 1054197620 (78.379%) in intersection
    hgLoadBed galGal2 unAnnotated unAnnotated.bed
    # not much of a drop in coverage with the -minSize:
    nice featureBits galGal2 unAnnotated
#825770129 bases of 1054197620 (78.332%) in intersection


# SYNTENY BY HOMOLOGY FROM EVGENY ZDOBNOV (DONE 4/15/04 angie)
    # Note 6/3/04: This has been replaced by
    # http://azra.embl-heidelberg.de/~zdobnov/Chicken/chicken-human-mm3_mouse-rat_synteny.txt
    # -- but unfortunately that only has start coords, no ends (or lengths).
    # could get lengths from ensmart... for all 4 genomes... see if there's 
    # any demand/interest.
    ssh kksilo
    mkdir /cluster/data/galGal2/bed/zdobnov
    cd /cluster/data/galGal2/bed/zdobnov
    wget --timestamping \
      http://azra.embl-heidelberg.de/~zdobnov/Chicken/homo_chicken_synteny.txt
    wget --timestamping \
      http://azra.embl-heidelberg.de/~zdobnov/Chicken/mouse_chicken_synteny.txt
    wget --timestamping \
      http://azra.embl-heidelberg.de/~zdobnov/Chicken/rat_chicken_synteny.txt
    # This is a one-shot, so I'm writing a perl script...
    transZdobnov.pl hg16 homo_chicken_synteny.txt  > zdobnovHg16.bed
    transZdobnov.pl mm3  mouse_chicken_synteny.txt > zdobnovMm3.bed
    transZdobnov.pl rn3  rat_chicken_synteny.txt   > zdobnovRn3.bed
    ssh hgwdev
    cd /cluster/data/galGal2/bed/zdobnov
    # One table per other-species db, all sharing the zdobnovSynt schema:
    sed -e 's/zdobnovSynt/zdobnovHg16/' $HOME/kent/src/hg/lib/zdobnovSynt.sql \
      > zd.sql
    hgLoadBed -sqlTable=zd.sql galGal2 zdobnovHg16 zdobnovHg16.bed
    sed -e 's/zdobnovSynt/zdobnovMm3/' $HOME/kent/src/hg/lib/zdobnovSynt.sql \
      > zd.sql
    hgLoadBed -sqlTable=zd.sql galGal2 zdobnovMm3 zdobnovMm3.bed
    sed -e 's/zdobnovSynt/zdobnovRn3/' $HOME/kent/src/hg/lib/zdobnovSynt.sql \
      > zd.sql
    hgLoadBed -sqlTable=zd.sql galGal2 zdobnovRn3 zdobnovRn3.bed


# FILTER MRNAS FOR CHECKING ENSEMBL GENES (DONE 4/12/04 angie)
    # Use the TB to generate BED for mRNA alignments with no overlap w/ensGene.
    # saved as galGal2.mrnaNotEnsGene.bed (Sent to Ewan 4/6/04 -- needed filt)
    ssh hgwdev
    mkdir /cluster/data/galGal2/bed/tightMrna
    cd /cluster/data/galGal2/bed/tightMrna
    # Get list of mRNA accs from the chEST project, which may be a little 
    # more shaky than others (?).
    # FIX(review): original log piped into "use galGal2 -N" -- a typo for
    # hgsql, matching the query pattern used elsewhere in this file:
    echo 'select mrna.acc from mrna,description \
          where mrna.description = description.id and \
          description.name like "Gallus gallus finished cDNA, clone ChEST%.";' \
    | hgsql galGal2 -N > chESTaccs.txt
    ssh kksilo
    cd /cluster/data/galGal2/bed/tightMrna
    # Make a uniq'd list of accs that didn't overlap ensGene:
    awk '{print $4;}' galGal2.mrnaNotEnsGene.bed | sort | uniq > mNotE.txt
    # Separate those into chEST and non-chEST:
    cp /dev/null mNotEChEST.txt
    cp /dev/null mNotEOther.txt
    foreach a (`cat mNotE.txt`)
      if (`grep $a chESTaccs.txt` != "") then
        echo $a >> mNotEChEST.txt
      else
        echo $a >> mNotEOther.txt
      endif
    end
    wc -l mNot*.txt
#   2987 mNotE.txt
#   2484 mNotEChEST.txt
#    503 mNotEOther.txt
    # Get psl for those alignments using the TB (paste in those files as 
    # keyword lists).  (All fields but bin.)  Save as mNotEChEST.psl, 
    # mNotEOther.psl.  Edit the #chrom ... comments out of them.
    # Dang, extra tab at end causes trouble for pslReps, edit out:
    perl -wpe 's/\t$//' mNotEChEST.psl > tmp; mv tmp mNotEChEST.psl
    perl -wpe 's/\t$//' mNotEOther.psl > tmp; mv tmp mNotEOther.psl
    # Run pslReps with more stringent parameters than used for genbank:
    pslReps -minCover=0.50 -minAli=0.97 -nearTop=0.001 \
      mNotEChEST.psl mNotEChEST.filt.psl /dev/null
    pslReps -minCover=0.50 -minAli=0.97 -nearTop=0.001 \
      mNotEOther.psl mNotEOther.filt.psl /dev/null

    # Get overlap with human net
    netToBed -maxGap=1000 -minFill=12 \
      /cluster/data/galGal2/bed/blastz.hg16.swap.2004-02-25/axtChain/human.net \
      netHg16Cov.bed

    ssh hgwdev
    cd /cluster/data/galGal2/bed/tightMrna
    # Load these tables temporarily for TB usage...
    hgLoadBed galGal2 netHg16Cov netHg16Cov.bed
    hgLoadPsl galGal2 -table=mNotEChEST mNotEChEST.filt.psl
    hgLoadPsl galGal2 -table=mNotEOther mNotEOther.filt.psl
    # Union of the two ab initio gene prediction tracks:
    featureBits galGal2 -or genscan geneid -bed=lotsaExons.bed
    hgLoadBed galGal2 lotsaExons lotsaExons.bed 

    # Use TB to keep mrna's with overlap w/netHg16Cov but no overlap w/ensGene.
    # Saved as mNotE{ChEST,Other}_pslReps_netHg16Cov_noEnsGene.bed
    # Use TB to overlap above w/lotsaExons (genscan | geneid).
    # Saved as mNotE{ChEST,Other}_pslReps_netHg16Cov_noEnsGene_abinit.bed
    # Made /usr/local/apache/htdocs/angie/ensPrelim/ w/index.html linking 
    # to custom track files and to browser w/custom tracks.

    # clean up.
    hgsql galGal2 -e 'drop table netHg16Cov; drop table lotsaExons'
    hgsql galGal2 -e 'drop table mNotEChEST; drop table mNotEOther'


# 5-SPECIES ORTHOLOGY FROM DEWEY/PACHTER (DONE 4/20/04 angie)
    ssh kksilo
    mkdir /cluster/data/galGal2/bed/deweySynt
    cd /cluster/data/galGal2/bed/deweySynt
    # Saved email attachment from Colin Dewey <cdewey@eecs.berkeley.edu>
    # as gg-hs-mm-pt-rn.map.gz
    gunzip gg-hs-mm-pt-rn.map.gz 
# format of the map file is:
# segment# chrom start end strand chrom start end strand ...
# where the columns chrom, start, end, strand are repeated for each 
# genome in the map (in this case we have 5*4 + 1 = 21 columns).  If a 
# segment is not found in one of the genomes, the columns for that genome 
# are set to "NA" for that segment.  The order of the genomes in the file is:
#   galGal2 hg16 mm3 panTro1 rn3
# (fixed comment: panTro1 was omitted; "pt" in the filename and column 13
#  used below confirm chimp is 4th of the 5 genomes)
# and all intervals are 0-based, half-open.
    # Make a bed6 track for each xenoDb.
    # (args: galGal2 column, xeno-genome column: 5=hg16 9=mm3 13=panTro1 17=rn3)
    ./deweyMap2Bed6.pl 1 5  gg-hs-mm-pt-rn.map > deweySyntHg16.bed
    ./deweyMap2Bed6.pl 1 9  gg-hs-mm-pt-rn.map > deweySyntMm3.bed
    ./deweyMap2Bed6.pl 1 13 gg-hs-mm-pt-rn.map > deweySyntPanTro1.bed
    ./deweyMap2Bed6.pl 1 17 gg-hs-mm-pt-rn.map > deweySyntRn3.bed
    ssh hgwdev
    cd /cluster/data/galGal2/bed/deweySynt
    hgLoadBed galGal2 deweySyntHg16 deweySyntHg16.bed
    hgLoadBed galGal2 deweySyntMm3 deweySyntMm3.bed
    hgLoadBed galGal2 deweySyntPanTro1 deweySyntPanTro1.bed
    hgLoadBed galGal2 deweySyntRn3 deweySyntRn3.bed


# CHICKEN NCRNA FROM SAM GRIFFITHS-JONES (DONE 4/20/04 angie)
    ssh hgwdev
    mkdir /cluster/data/galGal2/bed/ncRNA
    cd /cluster/data/galGal2/bed/ncRNA
    # saved email attachment from Sam Griffiths-Jones <sgj@sanger.ac.uk>
    # as gallus_gallus_ncrna_sgj_v1.gff
    # That's not GTF with exons/CDS/codons the way that ldHgGene expects,
    # so translate it into the rnaGene.as format:
    # (GFF is 1-based -> $start--; name falls back ACC:ID -> ID -> TYPE -> type)
    perl -wpe 'if (/^#/) { $_ = ""; next } chomp; \
               ($chr,$source,$type,$start,$end,undef,$strand,undef,$other) = split(/\t/); \
               $start--; \
               $fullScore = 0; \
               if ($other =~ /SCORE=([^;]+);/) { $fullScore = $1; } \
               $score = int($fullScore); \
               $source = "miRNA_Registry" if ($source eq "SGJ"); \
               if ($other =~ /ACC=([^;]+); ID=([^;]+);/) { $name = "$1:$2";} \
               elsif ($other =~ /ID=([^;]+);/) { $name = $1; } \
               elsif ($other =~ /TYPE=([^;]+);/) { $name = $1; } \
               else { $name = $type; } \
               $_ = "$chr\t$start\t$end\t$name\t$score\t$strand\t$source\t$type\t$fullScore\t0\n";' \
      gallus_gallus_ncrna_sgj_v1.gff > rnaGene.bed
    hgLoadBed galGal2 rnaGene -sqlTable=$HOME/kent/src/hg/lib/rnaGene.sql \
      -noBin rnaGene.bed
    # Downloaded new data 4/27/05... it's only the miRNA subset of the 
    # original combined {miRNA, Rfam, tRNAscan-SE} set.
    wget ftp://ftp.sanger.ac.uk/pub/databases/Rfam/miRNA/genomes/gga.gff


# TWINSCAN FROM BRENT LAB (DONE 5/1/04 angie - protein added 6/12/04)
    # contact: Paul Flicek <paul.flicek@wustl.edu>
    ssh hgwdev
    mkdir /cluster/data/galGal2/bed/twinscan
    cd /cluster/data/galGal2/bed/twinscan
    wget http://www.cs.wustl.edu/~pflicek/chicken/chicken.043004.ts.tar.gz
    tar xvzf chicken.043004.ts.tar.gz
    ldHgGene -gtf -frame -id -geneName galGal2 twinscan chr*.gtf
    # Sanity check the predictions (summary of problem categories follows):
    runGeneCheck .
#   1173 frameMismatch
#    775 badFrame
#    477 noStop
#    237 gap
#    223 noStart
#     22 frameDiscontig
#      6 inFrameStop
    wget http://www.cs.wustl.edu/~pflicek/chicken/chicken.043004.ts.ptx.tar.gz
    tar xvzf chicken.043004.ts.ptx.tar.gz
    # Append ".a" to protein ids to match the gtf transcript_id suffixes:
    perl -wpe 's/^>(\S+)/>$1.a/' chr*.prot.fa > twinscanPep.fa
    hgPepPred galGal2 generic twinscanPep twinscanPep.fa



# ISOCHORES FROM NEKRUTENKO LAB (DONE 5/3/04 angie)
    # contact: Jianbin HE <jbhe@psu.edu>
    ssh hgwdev
    mkdir /cluster/data/galGal2/bed/isochore
    cd /cluster/data/galGal2/bed/isochore
    wget http://www.bx.psu.edu/~jbhe/isochore/chi-iso.tar.gz
    tar xvzf chi-iso.tar.gz
    # Remove the custom track "browser" and "track" lines:
    cat chr*.bed | egrep -v '^(browser|track)' > isochore.bed
    hgLoadBed galGal2 isochore isochore.bed


# SGP GENE PREDICTIONS (REDONE 9/27/04 angie - NEW DATA 1/4/06 angie)
    # contact: "Cámara, Francisco" <FCamara@imim.es> (Roderic Guigo's group)
    ssh hgwdev
    mkdir /cluster/data/galGal2/bed/sgp
    cd /cluster/data/galGal2/bed/sgp
    foreach chr (`awk '{print $1;}' ../../chrom.sizes`)
      wget http://genome.imim.es/genepredictions/G.gallus/golden_path_200402/SGP/humangp200405+Btau2/$chr.gtf
    end
    # Add .1 suffix to proteins to match gtf transcript_id's:
    # NOTE(review): no download step for chr*.prot is shown here --
    # presumably fetched the same way as the .gtf files; confirm.
    cat chr*.prot | perl -wpe 's/^>(\w+)/>$1.1/' > sgpPep.fa
    # 1/4/06: get rid of redundant protein table:
    hgsql galGal2 -e 'drop table sgpPep'
    ldHgGene -gtf -genePredExt galGal2 sgpGene chr*.gtf


# CONTAMINATION (DONE 5/12/04)
    # emailed from LaDeana Hillier <lhillier@watson.wustl.edu>
    ssh hgwdev
    mkdir /cluster/data/galGal2/bed/contamination
    cd /cluster/data/galGal2/bed/contamination
    # saved email attachments to that dir:
    # weber_wgs_maize_mito.hits.txt, weber_wgs_vector_or_E.coli.txt,
    # chicken_accession_to_contig_mapping
    # One-shot perl script:
    chmod a+x mkContam.pl
    # maps contamination hits (acc coords) -> contigs -> chrom coords via AGP:
    mkContam.pl weber_wgs_maize_mito.hits.txt weber_wgs_vector_or_E.coli.txt \
      chicken_accession_to_contig_mapping \
      ../../agp_040224/chr*.agp \
    > contamination.bed
    hgLoadBed galGal2 contamination \
      -noBin -sqlTable=$HOME/kent/src/hg/lib/contamination.sql \
      contamination.bed


# FORMAT MARKER DATA FROM WUSTL (IN PROGRESS 4/27/05)
    ssh kksilo
    mkdir /cluster/data/galGal2/bed/markers
    cd /cluster/data/galGal2/bed/markers
    # Many files emailed from John Wallis <jwallis@watson.wustl.edu>
    # saved to:
    # accessionsDump040513.sql -- marker -> acc
    # geneticPositionsDump040513.sql -- marker, chrom, genPos, error, 
    #                                   primers(maybe), type(maybe), source
    # jacobssonPositionsDump040513.sql -- some overlap w/geneticPostions;
    #                                   marker, chrom, genPos, type, source
    # sequencesDump040513.sql
    # Uppsala_genetic.txt -- corrected positions for jacobssonPositions
    # GeneticMarkerAccID.txt -- marker -> acc, some overlap w/accessionsDump

    ssh hgwdev
    cd /cluster/data/galGal2/bed/markers
    # Stage everything in a scratch database ("wallis") for cross-referencing:
    hgsql -e "create database wallis"
    cat *.sql | hgsql wallis
    hgsql wallis <<_EOF_
create table uppsalaPositions (
    marker varchar(40) NOT NULL default '',
    chromosome varchar(40) NOT NULL default '',
    position int(11) default NULL,
    primary key (marker)
) TYPE=MyISAM;
load data local infile "Uppsala_genetic.txt" into table uppsalaPositions;
create table markerAcc (
    marker varchar(40) NOT NULL default '',
    acc varchar(40) NOT NULL default '',
    primary key (marker)
) TYPE=MyISAM;
load data local infile "GeneticMarkerAccID.txt" into table markerAcc;
_EOF_
    # extract sts marker sequences as fasta:
    echo 'select marker,sequence from sequences' \
    | hgsql -N wallis \
    | awk '{printf ">%s\n%s\n", $1, $2;}' > stsSequences.fa
    # extract primer pairs in ePCR format (works for isPCR too):
    echo 'select marker,primer1,primer2,length(sequence),source from sequences \
          where primer1 is not null and primer2 is not null' \
    | hgsql -N wallis \
    | awk '{printf "%s\t%s\t%s\t%d\t%s\n", $1, $2, $3, $4, $5;}' \
      > stsPrimers.epcr
    # Still need to extract aux. info... mainly need positions in various 
    # maps.  

    # Take a look at NCBI UniSTS too...
    ssh kolossus
    mkdir -p /santest/scratch/galGal2/sts
    cd /santest/scratch/galGal2/sts
    # This is the Wageningen map, with some UniSTS IDs associated with 
    # markers and distances:
    wget ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS_MapReports/Gallus_gallus/9031.WURC.txt
    # All UniSTS sequences and aliases... filter out from here:
    wget ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.sts
    wget ftp://ftp.ncbi.nih.gov/repository/UniSTS/UniSTS.aliases
    grep 'Gallus gallus' UniSTS.sts > gg.sts




# ANDY LAW CPGISSLANDS (DONE 6/16/04 angie)
    # Andy Law found 60k CpG islands while we found only 22k.  Andy sent 
    # his script (which needs another script/program to preprocess seq 
    # into CpG offsets, GC-since-last-CpG and length-since-last-CpG) to 
    # Laura Elnitski who sent it to us.  
    # --> /cluster/home/angie/andy-cpg-island.pl
    # His obs/exp calculation used a very rough approximation for number 
    # of C and number of G: length / 4 for each!!  
    # So I modified his script to take counts of C and G separately, 
    # and use the obs/exp from this '87 paper that everybody claims to follow 
    # (but they usually don't):
    # Gardiner-Garden and Frommer, CpG Islands in Vertebrate Genomes
    # J. Mol. Biol. (20 Jul. 1987)  196 (2), 261-282.
    # --> /cluster/home/angie/ggf-andy-cpg-island.pl
    # Had to write a oneshot program, preProcGgfAndy, to preprocess the 
    # fasta for the script.
    ssh kksilo
    mkdir /cluster/data/galGal2/bed/cpgIslandGgfAndy
    cd /cluster/data/galGal2/bed/cpgIslandGgfAndy
    cp /dev/null cpgIslandAndy.bed
    cp /dev/null cpgIslandGgfAndy.bed
    foreach f (../../*/chr*.fa)
      set chr = $f:t:r
      echo preproc $chr
      /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.preproc
      echo running original on $chr
      # Andy's original script: collapse C+G counts into one column,
      # then turn its 1-based output into bed 4 with serial cpg names:
      awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.preproc \
      | /cluster/home/angie/andy-cpg-island.pl \
      | perl -wpe '$i=0 if (not defined $i); \
                   chomp; ($s,$e) = split("\t"); $s--; \
                   $_ = "'$chr'\t$s\t$e\tcpg$i\n";  $i++' \
      >> cpgIslandAndy.bed
      echo running modified on $chr
      # Modified (Gardiner-Garden & Frommer obs/exp) script -> cpgIslandExt
      # style columns (name, length, cpgNum, gcNum, perCpg, perGc, obsExp):
      /cluster/home/angie/ggf-andy-cpg-island.pl $chr.preproc \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandGgfAndy.bed
    end
    # Now try Takai & Jones' script too, what the heck!  :)
    # (it took 4 days though, and unless I modify it, no CpG count...:b).
    cp /dev/null cpgIslandTJUnlifted.bed
    # Run per contig dir (chrN_1, chrN_2, ... via the csh {,?} glob):
    foreach d (../../*/chr*_?{,?})
      set ctg = $d:t
      /cluster/home/angie/cpgi130.pl $d/$ctg.fa \
      | tail +6 \
      | perl -wpe 'if (/CpG island (\d+), start=(\d+), end=(\d+), %GC=([\d\.]+), ObsCpG\/ExpCpG=([\d\.]+), Length=(\d+)/) { \
                     ($id, $s, $e, $pGc, $oE, $n) = ($1, $2-1, $3,$4,$5,$6); \
                     $cpg = 0;  $gc = int($pGc * $n / 100.0); \
                     $pCpG = 0; \
                     $_ = "'$ctg'\t$s\t$e\tcpg$id\t$n\t$cpg\t$gc\t" . \
                          "$pCpG\t$pGc\t$oE\n"; \
                   } else { die "Cant parse this:\n$_\n"; }' \
      >> cpgIslandTJUnlifted.bed
    end
    # Lift contig coords up to chrom coords:
    liftUp cpgIslandTJ.bed ../../jkStuff/liftAll.lft warn \
      cpgIslandTJUnlifted.bed
    # load into database:
    ssh hgwdev
    cd /cluster/data/galGal2/bed/cpgIslandGgfAndy
    # this one is a bed 4:
    hgLoadBed galGal2 cpgIAndy -tab -noBin cpgIslandAndy.bed
    # this one is a cpgIslandExt but with a different table name:
    sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql
    hgLoadBed galGal2 cpgIslandGgfAndy -tab -noBin \
      -sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed
    # likewise for perl'd output of Takai & Jones:
    sed -e 's/cpgIslandExt/cpgIslandTJ/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandTJ.sql
    hgLoadBed galGal2 cpgIslandTJ -tab -noBin \
      -sqlTable=cpgIslandTJ.sql cpgIslandTJ.bed
    # Compare genome coverage & island counts across the four methods:
    featureBits galGal2 cpgIslandExt
#15097911 bases of 1054197620 (1.432%) in intersection
    featureBits galGal2 cpgIAndy
#84546678 bases of 1054197620 (8.020%) in intersection
    featureBits galGal2 cpgIslandGgfAndy
#60782698 bases of 1054197620 (5.766%) in intersection
    featureBits galGal2 cpgIslandTJ
#27489160 bases of 1054197620 (2.608%) in intersection
    wc -l ../cpgIsland/cpgIsland.bed *bed
#  21802 ../cpgIsland/cpgIsland.bed
#  86247 cpgIslandAndy.bed
#  73381 cpgIslandGgfAndy.bed
#  25788 cpgIslandTJ.bed
    # OK, just for fun let's run ggf-andy script with TJ parameters!
    ssh kksilo
    cd /cluster/data/galGal2/bed/cpgIslandGgfAndy
    foreach f (../../*/chr*.fa)
      set chr = $f:t:r
      echo running modified w/TJ on $chr
      # Same modified script, but Takai & Jones' stricter thresholds;
      # reuses the $chr.preproc files generated above:
      /cluster/home/angie/ggf-andy-cpg-island.pl \
         -min-length 500 -percent-gc 55 -obs-over-exp 0.65 \
         $chr.preproc \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandGgfAndyTJ.bed
    end
    # load into database:
    ssh hgwdev
    cd /cluster/data/galGal2/bed/cpgIslandGgfAndy
    # this one is a cpgIslandExt but with a different table name:
    sed -e 's/cpgIslandExt/cpgIslandGgfAndyTJ/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyTJ.sql
    hgLoadBed galGal2 cpgIslandGgfAndyTJ -tab -noBin \
      -sqlTable=cpgIslandGgfAndyTJ.sql cpgIslandGgfAndyTJ.bed
    featureBits galGal2 cpgIslandGgfAndyTJ
#32146870 bases of 1054197620 (3.049%) in intersection
    wc -l cpgIslandGgfAndyTJ.bed
#  23685 cpgIslandGgfAndyTJ.bed
    # LaDeana asked for a run on masked seq for comparison.
    cp /dev/null cpgIslandAndyMasked.bed
    cp /dev/null cpgIslandGgfAndyMasked.bed
    foreach f (../../*/chr*.fa.masked)
      set chr = $f:t:r:r
      echo preproc masked $chr
      /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.masked.preproc
      echo running original on $chr masked
      awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.masked.preproc \
      | /cluster/home/angie/andy-cpg-island.pl \
      | perl -wpe '$i=0 if (not defined $i); \
                   chomp; ($s,$e) = split("\t"); $s--; \
                   $_ = "'$chr'\t$s\t$e\tcpg$i\n";  $i++' \
      >> cpgIslandAndyMasked.bed
      echo running modified on $chr masked
      /cluster/home/angie/ggf-andy-cpg-island.pl $chr.masked.preproc \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandGgfAndyMasked.bed
    end
    # load into database:
    ssh hgwdev
    cd /cluster/data/galGal2/bed/cpgIslandGgfAndy
    hgLoadBed galGal2 cpgIAndyMasked -tab -noBin cpgIslandAndyMasked.bed
    sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
    hgLoadBed galGal2 cpgIslandGgfAndyMasked -tab -noBin \
      -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
    featureBits galGal2 cpgIAndyMasked
#67353879 bases of 1054197620 (6.389%) in intersection
    featureBits galGal2 cpgIslandGgfAndyMasked
#47911662 bases of 1054197620 (4.545%) in intersection
    wc -l cpgIslandAndyMasked.bed cpgIslandGgfAndyMasked.bed
#  84570 cpgIslandAndyMasked.bed
#  70655 cpgIslandGgfAndyMasked.bed
    # 6/28/04 -- masking simpleRepeats, and even repeats other than Alu's,
    # might not be the right thing to do (?).  Give it a try with less-masked 
    # sequence.
    ssh kksilo
    cd /cluster/data/galGal2/bed/cpgIslandGgfAndy
    cp /dev/null cpgIslandGgfAndyOnlyRM.bed
    foreach f (../../*/chr*.fa)
      set chr = $f:t:r
      echo preproc, ggf-andy $chr onlyRM
      maskOutFa $f $f.out stdout \
      | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \
      | /cluster/home/angie/ggf-andy-cpg-island.pl \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandGgfAndyOnlyRM.bed
    end
#  70694 cpgIslandGgfAndyOnlyRM.bed
    ssh hgwdev
    cd /cluster/data/galGal2/bed/cpgIslandGgfAndy
    sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRM/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql
    hgLoadBed galGal2 cpgIslandGgfAndyOnlyRM -tab -noBin -sqlTable=/tmp/c.sql \
      cpgIslandGgfAndyOnlyRM.bed
    featureBits galGal2 cpgIslandGgfAndyOnlyRM.bed
#48007046 bases of 1054197620 (4.554%) in intersection
    # 1/26/05: Make better island names in cpgIslandGgfAndyMasked,
    # for Dave Burt's cross-species island comparisons.
    ssh kksilo
    cd /cluster/data/galGal2/bed/cpgIslandGgfAndy
    mv cpgIslandGgfAndyMasked.bed cpgIslandGgfAndyMasked.bed.orig
    perl -wpe '@w=split("\t"); $w[3] = "galGal2.$w[0]." . ($w[1]+1) . ".$w[2]"; \
               $_ = join("\t", @w);' \
      cpgIslandGgfAndyMasked.bed.orig \
    > cpgIslandGgfAndyMasked.bed
    ssh hgwdev
    cd /cluster/data/galGal2/bed/cpgIslandGgfAndy
    hgLoadBed -noBin -tab -sqlTable=cpgIslandGgfAndyMasked.sql \
      galGal2 cpgIslandGgfAndyMasked cpgIslandGgfAndyMasked.bed


# RELOAD ENSEMBL GENES (DONE 6/16/04 angie - REDONE 1/10/05 angie)
    # previous set was sent out to chicken group -- pull in the released 
    # version from Ensembl's main site.  Looks like 75 transcripts have 
    # been yanked, will ask Steve Searle and Ewan.  
    mkdir /cluster/data/galGal2/bed/ensembl
    cd /cluster/data/galGal2/bed/ensembl
    # Get the ensembl gene data from 
    # http://www.ensembl.org/Gallus_gallus/martview
    # Follow this sequence through the pages:
    # Page 1) Make sure that the Gallus_gallus choice is selected. Hit next.
    # Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
    # Page 3) Choose the "Structures" box. 
    # Page 4) Choose GTF as the output.  choose gzip compression.  hit export.
    # Save as ensembl.gff.gz
    # Add "chr" to front of each line in the gene data gtf file to make 
    # it compatible with our software.
    gunzip -c ensembl.gff.gz \
    | perl -wpe 's/^([0-9]+|E\w+|[ZW]|Un(_random)?)/chr$1/ \
                 || die "Line $. doesnt start with chicken chrom:\n$_"' \
    > ensGene.gtf
    ssh hgwdev
    ldHgGene -gtf -genePredExt galGal2 ensGene \
      /cluster/data/galGal2/bed/ensembl/ensGene.gtf

    # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
    # hgKnownToSuper.  Use ensMart to create it as above, except:
    # Page 3) Choose the "Features" box. In "Ensembl Attributes", check 
    # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.  
    # Choose Text, tab-separated as the output format.  Result name ensGtp.
    # Save file as ensGtp.txt.gz
    gunzip ensGtp.txt.gz
    hgsql galGal2 < ~/kent/src/hg/lib/ensGtp.sql
    hgsql galGal2 -e 'load data local infile "ensGtp.txt" into table ensGtp'

    # Load Ensembl peptides:
    # Get them from ensembl as above in the gene section except for
    # Page 3) Choose the "Sequences" box. 
    # Page 4) Transcripts/Proteins.  Peptide.  Format = FASTA.
    # Save file as ensemblPep.fa.gz
    gunzip -c ensemblPep.fa.gz \
    | perl -wpe 's/^>ENSGALG\w+\|(ENSGALT\d+\.\d+).*/>$1/' \
    > ensPep.fa
    hgPepPred galGal2 generic ensPep ensPep.fa


# ECORES FROM GENOSCOPE (DONE, hartera, 2004-08-09)
# download data from  http://www.genoscope.cns.fr/externe/4ucsc/
# ecotigCT - ecores on chicken genome which are conserved with Tetraodon

    ssh hgwdev
    mkdir -p /cluster/data/galGal2/bed/ecores
    # these are in a different format to the previous Ecores
    # Create a script here to parse the Ecores into bed format
    # add parse_ecotig.pl to this directory 
    cp /cluster/data/rn3/bed/ecores/parse_ecotig.pl \
              /cluster/data/galGal2/bed/ecores/

    # ecores conserved with Tetraodon
    mkdir -p /cluster/data/galGal2/bed/ecores/tetraodon
    cd /cluster/data/galGal2/bed/ecores/tetraodon

    # download data for ecotigCT to this directory
    wget -o fetch.log -r -l1 --no-directories --timestamp \
       http://www.genoscope.cns.fr/externe/4ucsc/ecotigCT
    # The format is like those for ecores on Drosophila conserved with Anopheles
    # but with a species code missing
    # [chr] [start ecore] [end ecore] [chr] [start ecore] [end ecore]
    # Reformat chromosome names to match those in our database
    sed -e 's/RANDOM/_random/g' ecotigCT | sed -e 's/UN/Un/g;' > ecotigCT.format
    # get list of chromosomes represented
    awk '{print $1}' ecotigCT.format | sort -n | uniq > chrom.lst
    # compare chroms to those in galGal2: E64 and chr6_random are missing
    # use perl script to process this to the same format as those
    # for rn3, hg16 and mm3
    cd ..
cat << 'EOF_' > /cluster/data/galGal2/bed/ecores/processEcotigs.pl
#!/usr/bin/perl -w
use strict;
# convert ecores file to bed format
# Format of Ecores file:
# [chr] [start ecore] [end ecore] [chr] [start ecore] [end ecore]
# if the second set of ecores are the same as those at the beginning of the
# next line then they all belong to the same ecotig
# Process so all ecores from the same ecotig are on one line as for the other
# format.
my $beg_start = 0;
my $beg_end = 0;
my $last_start = 0;
my $last_end = 0;
my $chr_old = "";
my $count = 0;
                                                                                
while (<STDIN>) {
   chomp;
   # remove trailing space
   s/\s+$//;
   my @fields = split(/\s+/);
   my $chr = $fields[0];
   $beg_start = $fields[1];
   $beg_end = $fields[2];
   # if first two co-ordinates are the same as those on the previous line
   if (($beg_start == $last_start) && ($beg_end == $last_end) && ($chr eq $chr_old)) {
      # get new last start and end from this line
      print "\t$last_start\t$last_end";
      $count++;
   }
   else {
      if ($count > 0) {
         print "\t$last_start\t$last_end";
         $count = 0;
      }
                                                                                
      print "\nchr$chr\t$beg_start\t$beg_end";
   }
   $chr_old = $chr;
   if ($#fields == 5) {
      $last_start = $fields[$#fields - 1];
      $last_end = $fields[$#fields];
   }
}
'_EOF_'
    # << this line makes emacs coloring happy

    # convert data to bed file format 
    # copy parse_ecotigs.pl from rn3
    cp /cluster/data/rn3/bed/ecores/parse_ecotig.pl .
    cd tetraodon
    perl ../processEcotigs.pl < ecotigCT.format > ecotigCT.format2
    perl ../parse_ecotig.pl < ecotigCT.format2 > ecotigCT.bed
    hgLoadBed -tab galGal2 ecoresTetraodon ecotigCT.bed


# SWAP BLASTZ MOUSE-CHICKEN TO CHICKEN-MOUSE (MM5) (DONE 8/18/04 angie)
    # mm5-galGal2 alignments were already axtRescored, no need to re-rescore.
    ssh kolossus
    mkdir /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15
    cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15
    set aliDir = /cluster/data/mm5/bed/blastz.galGal2.2004-07-15
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    mkdir unsorted axtChrom
    cat $aliDir/axtChrom/chr*.axt \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSplitByTarget stdin unsorted
    # Sort the shuffled .axt files.
    foreach f (unsorted/*.axt)
      echo sorting $f:t:r
      axtSort $f axtChrom/$f:t
    end
    du -sh $aliDir/axtChrom unsorted axtChrom
#1.3G    /cluster/data/mm5/bed/blastz.galGal2.2004-07-15/axtChrom
#1.3G    unsorted
#1.3G    axtChrom
    rm -r unsorted


# CHAIN MOUSE BLASTZ (DONE 8/20/04 angie)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtChrom/*.axt \
      > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out exists out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh -ef
# Chain one chromosome's axt alignments.
# Args (from gsub): $1 = input axtChrom/chrN.axt, $2 = output chain file,
# $3 = captured stdout (out/chrN.out).
# Uses the HoxD55 score matrix and the chicken/human-tuned linear gap file.
axtChain -scoreScheme=/cluster/data/blastz/HoxD55.q \
           -linearGap=/cluster/data/blastz/chickenHumanTuned.gap \
           -minScore=5000 $1 \
  /iscratch/i/galGal2/nib \
  /iscratch/i/mus/mm5/softNib $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check, push, check...
#Completed: 52 of 52 jobs
#Average job time:                  78s       1.31m     0.02h    0.00d
#Longest job:                      391s       6.52m     0.11h    0.00d
#Submission to last job:           391s       6.52m     0.11h    0.00d

    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain
    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r
      textHistogram -binSize=10000 /tmp/score.$f:t:r
      echo ""
    end

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtChain/chain
    foreach i (*.chain)
        set c = $i:r
        echo loading $c
        hgLoadChain galGal2 ${c}_chainMm5 $i
    end


# NET MOUSE BLASTZ (DONE 8/20/04 angie)
    ssh kksilo
    cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtChain
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin -minSpace=1 ../S1.len ../S2.len stdout /dev/null \
    | netSyntenic stdin noClass.net

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtChain
    netClass noClass.net galGal2 mm5 mouse.net

    # Make a 'syntenic' subset:
    ssh kksilo
    cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtChain
    rm noClass.net
    # Make a 'syntenic' subset of these with
    netFilter -syn mouse.net > mouseSyn.net

    # Load the nets into database 
    ssh hgwdev
    cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtChain
    netFilter -minGap=10 mouse.net |  hgLoadNet galGal2 netMm5 stdin
    netFilter -minGap=10 mouseSyn.net | hgLoadNet galGal2 netSyntenyMm5 stdin
    # Add entries for chainMm5, netMm5, netSyntenyMm5 to chicken/galGal2 trackDb


# GENERATE MM5 MAF FOR MULTIZ FROM NET (DONE 8/20/04 angie)
    ssh kksilo
    cd /cluster/data/galGal2/bed/blastz.mm5/axtChain
    netSplit mouse.net net
    cd /cluster/data/galGal2/bed/blastz.mm5
    mkdir axtNet
    foreach f (axtChain/net/*)
      set chr = $f:t:r
      netToAxt $f axtChain/chain/$chr.chain /cluster/data/galGal2/nib \
        /cluster/data/mm5/nib stdout \
      | axtSort stdin axtNet/$chr.axt
    end
    mkdir mafNet
    foreach f (axtNet/chr*.axt)
      set maf = mafNet/$f:t:r.maf
      axtToMaf $f \
            /cluster/data/galGal2/chrom.sizes /cluster/data/mm5/chrom.sizes \
            $maf -tPrefix=galGal2. -qPrefix=mm5.
    end


# MAKE VSMM5 DOWNLOADABLES (TODO 8/20/04 angie)
    ssh kksilo
    cd /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtChain
    gzip -c all.chain > /cluster/data/galGal2/zip/mouse.chain.gz
    gzip -c mouse.net > /cluster/data/galGal2/zip/mouse.net.gz

    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/galGal2/vsMm5
    cd /usr/local/apache/htdocs/goldenPath/galGal2/vsMm5
    mv /cluster/data/galGal2/zip/mouse*.gz .
    mkdir axtNet
    cp -p \
     /cluster/data/galGal2/bed/blastz.mm5.swap.2004-07-15/axtNet/chr*.axt.gz \
     axtNet/
    md5sum *.gz > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.


# VAR_MULTIZ 3-WAY (DONE 8/23/04 angie)
    # Copy inputs to bluearc
    ssh kksilo
    mkdir /cluster/bluearc/var_multiz.galGal2{,/hg17,/mm5}
    cp -p /cluster/data/galGal2/bed/blastz.hg17/mafNet/* \
      /cluster/bluearc/var_multiz.galGal2/hg17
    cp -p /cluster/data/galGal2/bed/blastz.mm5/mafNet/* \
      /cluster/bluearc/var_multiz.galGal2/mm5
    # Small cluster job:
    ssh kki
    mkdir /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23
    cd /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23
    cat > doMultiz.csh << '_EOF_'
#!/bin/csh -ef
# Build a 3-way (galGal2/hg17/mm5) maf for one chromosome.
# $1 = hg17 pairwise maf, $2 = mm5 pairwise maf, $3 = output maf.
# If both inputs exist, run var_multiz and re-project onto the galGal2
# chromosome; if only one exists, pass it through unchanged.
# (If neither exists, no output file is produced.)
set path = (/cluster/bin/penn/var_multiz.2004.08.12 $path)
if (-e $1 && -e $2) then
  set chr = $1:t:r
  set tmp = /scratch/$chr.tmp.maf
  var_multiz $1 $2 0 0 > $tmp
  maf_project $tmp galGal2.$chr > $3
  rm $tmp
else if (-e $1) then
  cp $1 $3
else if (-e $2) then
  cp $2 $3
endif
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doMultiz.csh
    cp /dev/null jobList
    foreach chr (`awk '{print $1;}' /cluster/data/galGal2/chrom.sizes`)
      set f1 = /cluster/bluearc/var_multiz.galGal2/hg17/$chr.maf
      set f2 = /cluster/bluearc/var_multiz.galGal2/mm5/$chr.maf
      echo "doMultiz.csh $f1 $f2 ghm/$chr.maf" >> jobList
    end
    mkdir ghm
    para create jobList
    para try, check, push, check, ...
#Completed: 54 of 54 jobs
#Average job time:                  30s       0.49m     0.01h    0.00d
#Longest job:                      190s       3.17m     0.05h    0.00d
#Submission to last job:           208s       3.47m     0.06h    0.00d

    # Clean up.
    rm -r /cluster/bluearc/var_multiz.galGal2

    # setup external files for database reference
    ssh hgwdev
    cd /tmp
    mkdir /gbdb/galGal2/multizHg17Mm5{,/maf}
    ln -s /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23/ghm/chr*.maf \
      /gbdb/galGal2/multizHg17Mm5/maf/
    # load multiz maf
    hgLoadMaf -pathPrefix=/gbdb/galGal2/multizHg17Mm5/maf -warn \
      galGal2 multizHg17Mm5
    # load pairwise mafs
    mkdir /gbdb/galGal2/{human,mouse}_ghm
    ln -s /cluster/data/galGal2/bed/blastz.hg17/mafNet/*.maf \
      /gbdb/galGal2/human_ghm/
    ln -s /cluster/data/galGal2/bed/blastz.mm5/mafNet/*.maf \
      /gbdb/galGal2/mouse_ghm/
    hgLoadMaf -WARN galGal2 human_ghm
    hgLoadMaf -WARN galGal2 mouse_ghm


# PHASTCONS CHICKEN/HUMAN/MOUSE (DONE 8/24/04 angie)
# NOTE: REPLACED, SEE "PHASTCONS 7WAY" BELOW
    # Make .ss input for phastCons
    ssh kksilo
    mkdir /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23/phastCons
    cd /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23/phastCons
    set WINSIZE=1000000
    set WINOVERLAP=0
    mkdir SS
    foreach f (../ghm/*.maf)
      set chr=$f:t:r
      set c=`echo $chr | sed 's/chr//; s/_random//;'`
      echo $f $chr $c
      mkdir SS/$chr
      /cluster/bin/phast/msa_split $f -i MAF -o SS \
        -O galGal2,hg17,mm5 -M /cluster/data/galGal2/$c/$chr.fa \
        -w ${WINSIZE},${WINOVERLAP} -r SS/$chr/$chr -I 1000 -d 1 -B 5000
    end
    # save some space and I/O time:
    gzip SS/*/*.ss
    # put on bluearc for cluster runs:
    mv SS /cluster/bluearc/galGal2.SS

    # Run phastCons twice:
    # - more stringent parameters to get postProbs for wiggle
    # - less stringent params (more coverage) to get conserved elements.
    ssh kk9
    cd /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23/phastCons
    # Pare down hg17 8-way tree:
    /cluster/bin/phast/tree_doctor --rename '1->hg17; 3->mm5; 6->galGal2' \
      /cluster/data/hg17/bed/multiz8way/cons/run.cons/hg17panTro1mm5rn3canFam1galGal2fr1danRer1.mod \
    | /cluster/bin/phast/tree_doctor --prune-all-but galGal2,hg17,mm5 - \
    | perl -wpe 's/TREE: \((\(hg17:\S+),(galGal2:\S+)\)/TREE: ($2,$1)/' \
    > rev-dg.mod
    # More stringent: same --transitions and --nrates as human 8-way.
    cat << '_EOF_' > doPostProbs
#!/bin/csh -fe
# Run phastCons on one .ss window to produce conservation posterior
# probabilities (input for the wiggle track).
# $1 = gzipped .ss file; output is POSTPROBS/<chrom>/<root>.pp.gz.
# Work is staged on node-local /scratch to cut fileserver I/O.

set PHAST=/cluster/bin/phast
set TMP=/scratch/phastCons/ghm

set file=$1
set root = $file:t:r:r
set chrom = $root:r

mkdir -p $TMP
cp $file $TMP/$root.ss.gz
zcat $TMP/$root.ss.gz \
| $PHAST/phastCons - rev-dg.mod --nrates 40 --transitions 0.08,0.008 --quiet \
  > $TMP/$root.pp
mkdir -p POSTPROBS/$chrom
gzip -c $TMP/$root.pp > POSTPROBS/$chrom/$root.pp.gz
rm $TMP/$root.pp
rm $TMP/$root.ss.gz
'_EOF_'
    # << this line makes emacs coloring happy
    chmod 775 doPostProbs
    rm -f jobListPostProbs
    foreach f (`ls -1S /cluster/bluearc/galGal2.SS/*/*.ss.gz`)
      echo './doPostProbs {check in exists+ '$f'}' \
      >> jobListPostProbs
    end
    para create jobListPostProbs 
    para try ; para push ; etc....
#Completed: 970 of 970 jobs
#Average job time:                  10s       0.17m     0.00h    0.00d
#Longest job:                       16s       0.27m     0.00h    0.00d
#Submission to last job:           122s       2.03m     0.03h    0.00d

    # now create wiggle track
    ssh kksilo
    cd /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23/phastCons
    mkdir wib
    foreach dir (POSTPROBS/*)
        echo $dir
        set chr=$dir:t
        zcat `ls -1 $dir/*.pp.gz | sort -t\. -k2,2n` | \
            wigAsciiToBinary -chrom=$chr \
            -wibFile=wib/${chr}_phastCons stdin
    end
    ssh hgwdev
    mkdir -p /gbdb/galGal2/multizHg17Mm5/wib
    cd /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23/phastCons
    chmod 775 . wib
    chmod 664 wib/*.wib
    ln -s `pwd`/wib/*.wib /gbdb/galGal2/multizHg17Mm5/wib/
    hgLoadWiggle galGal2 mzHg17Mm5_phast_wig \
      -pathPrefix=/gbdb/galGal2/multizHg17Mm5/wib wib/*.wig

    # Less stringent run: same --transitions as human 8-way, but smaller 
    # nrates (fewer categories ==> more frequently in top category)
    ssh kk9
    cd /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23/phastCons
    cat << '_EOF_' > doEls
#!/bin/csh -fe
# Run phastCons on one .ss window to predict conserved elements only
# (--viterbi bed output, --no-post-probs).  $1 = gzipped .ss file;
# output is PREDICTIONS/<chrom>/<root>.bed.
# Uses fewer rate categories (--nrates 10) than doPostProbs for more coverage.

set PHAST=/cluster/bin/phast
set TMP=/scratch/phastCons/ghm

set file=$1
set root = $file:t:r:r
set chrom = $root:r

mkdir -p $TMP
mkdir -p PREDICTIONS/$chrom
cp $file $TMP/$root.ss.gz
zcat $TMP/$root.ss.gz \
| $PHAST/phastCons - rev-dg.mod --nrates 10 --transitions 0.08,0.008 --quiet \
    --viterbi PREDICTIONS/$chrom/$root.bed --score --seqname $chrom \
    --no-post-probs
# NOTE(review): nothing below writes into POSTPROBS -- this mkdir looks
# like a leftover from doPostProbs.
mkdir -p POSTPROBS/$chrom
rm $TMP/$root.ss.gz
'_EOF_'
    # << this line makes emacs coloring happy
    chmod 775 doEls
    rm -f jobListEls
    foreach f (`ls -1S /cluster/bluearc/galGal2.SS/*/*.ss.gz`)
      echo './doEls {check in exists+ '$f'}' \
      >> jobListEls
    end
    para create jobListEls 
    para try ; para push ; etc....
#Completed: 970 of 970 jobs
#Average job time:                   4s       0.06m     0.00h    0.00d
#Longest job:                        7s       0.12m     0.00h    0.00d
#Submission to last job:            56s       0.93m     0.02h    0.00d

    # Create bed track: tweak scores and names, trim the unused strand column.
    cat PREDICTIONS/*/*.bed \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
    | /cluster/bin/scripts/lodToBedScore \
    > all.bed

    ssh hgwdev
    cd /cluster/data/galGal2/bed/multiz.hg17mm5.2004-08-23/phastCons
    hgLoadBed galGal2 phastConsElements all.bed
    featureBits galGal2 phastConsElements
#34279869 bases of 1054197620 (3.252%) in intersection
    # make top-5000 list and launcher on Adam's home page:
    # NOTE(review): raw.bed is not created above -- presumably the
    # pre-lodToBedScore concatenation of PREDICTIONS/*/*.bed; confirm
    sort -k5,5nr raw.bed | head -5000 > top5000.bed
    /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
      /cse/grads/acs/public_html/gg-top5000 \
      "top 5000 conserved elements" galGal2

    # Clean up
    rm -r /cluster/bluearc/galGal2.SS


# MAKE Human Proteins track
    mkdir -p /cluster/data/galGal2/bed/tblastn.hg16KG
    cd /cluster/data/galGal2/bed/tblastn.hg16KG
    ls -1S /iscratch/i/galGal2/blastDb/*.nsq | sed "s/\.nsq//" > chick.lst

    mkdir kgfa
    # calculate a reasonable number of jobs
    calc `wc /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl | awk "{print \\\$1}"`/\(350000/`wc chick.lst | awk "{print \\\$1}"`\)
    # 41117/(350000/259) = 30.426580
    split -l 300 /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl kgfa/kg
    cd kgfa
    for i in *; do split -l 30 $i $i; rm $i; done
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    mkdir -p /cluster/bluearc/galGal2/bed/tblastn.hg16KG/kgfa
    cp * /cluster/bluearc/galGal2/bed/tblastn.hg16KG/kgfa
    cd ..
    
    ls -1S /cluster/bluearc/galGal2/bed/tblastn.hg16KG/kgfa/*.fa > kg.lst
    mkdir blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
# tblastn one batch of hg16 known-gene proteins ($2) against one chicken
# blast db chunk ($1), producing a lifted, pslCheck'd psl ($3).
# Retries blastall with progressively stricter e-values until one succeeds.
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
f=/tmp/`basename $3`
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
# convert blast output to psl, lift target side to chicken chroms and query
# side back to protein coordinates, then sanity-check before installing.
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -nohead $f.3 /iscratch/i/galGal2/lifts/liftAll.lft warn $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /iscratch/i/hg16/lifts/protein.lft warn $f.3
	if pslCheck -prot $3.tmp
	then
	    mv $3.tmp $3
	    rm -f $f.1 $f.2 $f.3
	    exit 0
	fi
    fi
fi
# failure path cleanup (note: $f.3 is not removed here)
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 chick.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/galGal2/bed/tblastn.hg16KG
    para create blastSpec
    para push

# Completed: 355088 of 355089 jobs
# CPU time in finished jobs:   43208509s  720141.82m 12002.36h  500.10d  1.370 y
# IO & Wait Time:               2326912s   38781.87m   646.36h   26.93d  0.074 y
# Average job time:                 128s       2.14m     0.04h    0.00d
# Longest job:                    33767s     562.78m     9.38h    0.39d
# Submission to last job:         90026s    1500.43m    25.01h    1.04d

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=75000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/kg???? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/galGal2/bed/tblastn.hg16KG
    para create chainSpec
    para push
# Completed: 1371 of 1371 jobs
# CPU time in finished jobs:     941133s   15685.54m   261.43h   10.89d  0.030 y
# IO & Wait Time:                 11226s     187.11m     3.12h    0.13d  0.000 y
# Average job time:                 695s      11.58m     0.19h    0.01d
# Longest job:                    14841s     247.35m     4.12h    0.17d
# Submission to last job:         23130s     385.50m     6.42h    0.27d

    exit
    # back to kksilo
    cd /cluster/data/galGal2/bed/tblastn.hg16KG/blastOut
    for i in kg????
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n  | uniq  > ../preblastHg16KG.psl
    cd ..
    blatDir=/cluster/data/hg16/bed/blat.hg16KG
    protDat -kg preblastHg16KG.psl $blatDir/hg16KG.psl $blatDir/kg.mapNames blastHg16KG.psl

    ssh hgwdev
    cd /cluster/data/galGal2/bed/tblastn.hg16KG
    hgLoadPsl galGal2 blastHg16KG.psl
    exit
    # back to kksilo
    rm -rf blastOut
# End tblastn


# ACCESSIONS FOR CONTIGS (DONE 2004-08-20 kate)
#  Used for Assembly track details, and also ENCODE AGP's
    cd /cluster/data/galGal2/bed
    mkdir -p contigAcc
    cd contigAcc
    
    # extract WGS contig to accession mapping from Entrez Nucleotide summaries
    # To get the summary file, access the Genbank page for the project 
    # by searching:
    #       genus[ORGN] AND WGS[KYWD]
    # At the end of the page, click on the list of accessions for the contigs.
    # Select summary format, and send to file.
    # output to file <species>.acc

    contigAccession chicken.acc > contigAcc.tab
    wc -l contigAcc.tab
        # 111864 contigAcc.tab
    cd /cluster/data/galGal2/agp_040224
     grep Contig *.agp | wc -l
        # 111864
    hgsql galGal2 < $HOME/kent/src/hg/lib/contigAcc.sql
    echo "LOAD DATA LOCAL INFILE 'contigAcc.tab' INTO TABLE contigAcc" | \
        hgsql galGal2
    hgsql galGal2 -e "SELECT COUNT(*) FROM contigAcc"
        # 111864


# Pick up photo from NHGRI (DONE - 2004-12-22 - Hiram)
    ssh hgwdev
    cd /tmp
    wget -O redfowl_image.jpg \
	'http://www.genome.gov/Pages/News/Photos/Science/redfowl_image.jpg'
    #	view that image in 'display' to determine crop edges, then:
    convert -crop 600x488+80+32 -quality 80 -sharpen 0 \
	-normalize redfowl_image.jpg ggg.jpg
    convert -geometry 300x200 -quality 80 ggg.jpg Gallus_gallus.jpg

    rm -f ggg.jpg redfowl_image.jpg
    cp -p Gallus_gallus.jpg /usr/local/apache/htdocs/images
    #	add links to this image in the description.html page, request push


# MAKE Human Proteins track (DONE 2005-1-15 braney)
    ssh kksilo
    mkdir -p /cluster/data/galGal2/blastDb
    cd /cluster/data/galGal2/blastDb
    for i in `cat ../chrom.lst`; do ls -1 ../$i/*/*.fa  ; done > list
    for i in `cat list`
    do
	ln -s $i .
	formatdb -i `basename $i` -p F
    done
    rm *.log *.fa list
    # cd ..
    # for i in `cat chrom.lst`; do cat  $i/chr*/*.lft  ; done > jkStuff/subChr.lft

    ssh kkr1u00
    mkdir -p /iscratch/i/galGal2/blastDb
    cd /cluster/data/galGal2/blastDb
    for i in nhr nin nsq; do cp *.$i /iscratch/i/galGal2/blastDb     ; echo $i; done

    cd
    iSync > sync.out

    mkdir -p /cluster/data/galGal2/bed/tblastn.hg17KG
    cd /cluster/data/galGal2/bed/tblastn.hg17KG
    echo  /iscratch/i/galGal2/blastDb/*.nsq  | xargs ls -S | sed "s/\.nsq//"  > query.lst  
    # back to kksilo
    exit

    cd /cluster/data/galGal2/bed/tblastn.hg17KG
    mkdir -p /cluster/bluearc/galGal2/bed/tblastn.hg17KG/kgfa
    split -l 65 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/galGal2/bed/tblastn.hg17KG/kgfa/kg
    ln -s /cluster/bluearc/galGal2/bed/tblastn.hg17KG/kgfa kgfa
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir -p /cluster/bluearc/galGal2/bed/tblastn.hg17KG/blastOut
    ln -s /cluster/bluearc/galGal2/bed/tblastn.hg17KG/blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    cat << '_EOF_' > blastSome
#!/bin/sh
# tblastn one batch of hg17 known-gene proteins ($2) against one chicken
# blast db chunk ($1), producing a lifted, pslCheck'd psl ($3).
# Retries blastall with progressively stricter e-values until one succeeds.
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    # convert blast output to psl, lift target side to chicken chroms and
    # query side back to protein coordinates, then sanity-check before
    # installing the result as $3.
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
        liftUp -nosort -type=".psl" -nohead $f.3 ../../jkStuff/liftAll.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.3
        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3
            # exit 0 only when the checked psl was installed (matches the
            # hg16 version of this script); previously this exit sat
            # outside the pslCheck block, so a failed check still exited 0
            # with no output and left $3.tmp/$f.* behind.
            exit 0
        fi
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3
exit 1
'_EOF_'

    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/galGal2/bed/tblastn.hg17KG
    para create blastSpec
    para push
# Completed: 168091 of 168091 jobs
# CPU time in finished jobs:   42705613s  711760.22m 11862.67h  494.28d  1.354 y
# IO & Wait Time:               2300980s   38349.66m   639.16h   26.63d  0.073 y
# Average job time:                 268s       4.46m     0.07h    0.00d
# Longest job:                    56699s     944.98m    15.75h    0.66d
# Submission to last job:        171648s    2860.80m    47.68h    1.99d

    # Gsub template: one chainSome invocation per blastOut subdirectory.
    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'

    # chainSome: chain together the per-query psl hits in one directory
    # (protein-space chaining, psl output, max 200kb gap).
    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=200000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    # Chain on the small cluster.
    ssh kki
    cd /cluster/data/galGal2/bed/tblastn.hg17KG
    para create chainSpec
    para push
# Completed: 649 of 649 jobs
# CPU time in finished jobs:     243653s    4060.88m    67.68h    2.82d  0.008 y
# IO & Wait Time:                 50013s     833.55m    13.89h    0.58d  0.002 y
# Average job time:                 452s       7.54m     0.13h    0.01d
# Longest job:                     3831s      63.85m     1.06h    0.04d
# Submission to last job:         24525s     408.75m     6.81h    0.28d

    exit
    # back to kksilo
    cd /cluster/data/galGal2/bed/tblastn.hg17KG/blastOut
    # Keep chains whose query coverage exceeds 60% (psl qStart/qEnd/qSize
    # columns), take the best-scoring unique ones, and also keep chains
    # whose match count covers >60% of the query.
    for i in kg??
    do 
	cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > cs60.$i.psl
	sort -rn cs60.$i.psl | pslUniq stdin us.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" cs60.$i.psl > ms60.$i.psl
	echo $i
    done

    # NOTE(review): "-k 17,17n" is given twice below; the duplicate sort key
    # is redundant (harmless, but possibly one was meant to be another column).
    cat us.*.psl ms60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n  | uniq  > /cluster/data/galGal2/bed/tblastn.hg17KG/blastHg17KG.psl
    cd ..

    ssh hgwdev
    cd /cluster/data/galGal2/bed/tblastn.hg17KG
    hgLoadPsl galGal2 blastHg17KG.psl

    # back to kksilo
    rm -rf blastOut
# End tblastn


# MAKE Human Known Gene mRNAs Mapped by BLATZ track (hg17 DONE 1/18/05 braney)
    # Create working directory and extract known gene mrna from database
    ssh hgwdev
    mkdir -p /cluster/data/galGal2/bed/blatz.hg17KG
    cd /cluster/data/galGal2/bed/blatz.hg17KG
    pepPredToFa hg17 knownGeneMrna known.fa

    # Split known genes into pieces and load on /iscratch/i temp device
    ssh kkr1u00
    cd /iscratch/i/hg17
    mkdir knownRna2
    cd knownRna2
    faSplit sequence /cluster/data/galGal2/bed/blatz.hg17KG/known.fa 580 known
    iSync
    rm /cluster/data/galGal2/bed/blatz.hg17KG/known.fa

    # Do parasol job to align via blatz
    ssh kk
    cd /cluster/data/galGal2/bed/blatz.hg17KG
    mkdir run0
    cd run0
    mkdir psl
    # awk '{print $1;}' /cluster/data/galGal2/chrom.sizes > genome.lst
    ls -1S /iscratch/i/galGal2/trfFa/*.fa > genome.lst
    # One output subdirectory per target file:
    for i in `cat genome.lst` ;  do f=`basename $i .fa`; mkdir psl/$f ; done 
    for i in  /iscratch/i/hg17/knownRna2/*.fa ; do echo $i; done > mrna.lst
    # doBlatz: align one mRNA batch ($2) against one masked target file ($1)
    # with blatz, lift contig coords to chrom coords, and install the result
    # atomically at $3 (the file parasol's "check out" verifies).
    # NOTE(review): the failure path runs "rm $f.1 $f.2" without -f, so it may
    # print "No such file" warnings when blatz itself failed -- harmless.
    cat << '_EOF_' > doBlatz
#!/bin/sh
f=/tmp/`basename $1`.`basename $2`
if blatz -rna -minScore=6000 -out=psl $1 $2 $f.1
then
    if liftUp -nosort -type=".psl" -nohead $f.2 ../../../jkStuff/liftAll.lft carry $f.1
    then
	mv $f.2 $3
	rm $f.1
	exit 0
    fi
fi
rm $f.1 $f.2
exit 1
'_EOF_'
    chmod +x doBlatz
    cat << '_EOF_' > gsub
#LOOP
doBlatz   $(path1) $(path2) {check out line psl/$(root1)/$(root2).psl}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    # Then do usual para try/para push/para time
# Completed: 148407 of 148407 jobs
# CPU time in finished jobs:    1884903s   31415.06m   523.58h   21.82d  0.060 y
# IO & Wait Time:                490225s    8170.41m   136.17h    5.67d  0.016 y
# Average job time:                  16s       0.27m     0.00h    0.00d
# Longest job:                      116s       1.93m     0.03h    0.00d
# Submission to last job:         28248s     470.80m     7.85h    0.33d

    # Do sort and near-besting on file server
    ssh kksilo
    cd /cluster/data/galGal2/bed/blatz.hg17KG/run0
    catDir -r psl | pslFilter -minScore=100 -minAli=255 -noHead stdin stdout | sort -k 10 > ../filtered.psl
    cd ..
    pslReps filtered.psl nearBest.psl /dev/null -nohead -minAli=0.5 -nearTop=0.01 -minCover=0.255
    sort -k 14 nearBest.psl > blatzHg17KG.psl

    # Clean up
    rm -r run0/psl 

    # Load into database
    ssh hgwdev
    cd /cluster/data/galGal2/bed/blatz.hg17KG
    hgLoadPsl galGal2 blatzHg17KG.psl 


# MAKE GALGAL2-HG17 OVER.CHAIN FOR LIFTOVER  (DONE 1/25/05 angie)
    ssh kolossus
    # (csh syntax) Subset the chains actually used by the net to get the
    # liftOver chain file.
    set chainDir = /cluster/data/galGal2/bed/blastz.hg17/axtChain
    mkdir -p /cluster/data/galGal2/bed/bedOver
    netChainSubset $chainDir/human.net $chainDir/all.chain \
      /cluster/data/galGal2/bed/bedOver/galGal2ToHg17.over.chain
    

# MAKE GALGAL2-MM5 OVER.CHAIN FOR LIFTOVER  (DONE 1/25/05 angie)
    ssh kolossus
    set chainDir = /cluster/data/galGal2/bed/blastz.mm5/axtChain
    mkdir -p /cluster/data/galGal2/bed/bedOver
    netChainSubset $chainDir/mouse.net $chainDir/all.chain \
      /cluster/data/galGal2/bed/bedOver/galGal2ToMm5.over.chain

# BLASTZ RAT (RN3) (IN PROGRESS 1/26/05 angie)
    ssh kk
    mkdir /cluster/data/galGal2/bed/blastz.rn3.2005-01-26
    # Relative symlink target resolves within bed/:
    ln -s blastz.rn3.2005-01-26 /cluster/data/galGal2/bed/blastz.rn3
    cd /cluster/data/galGal2/bed/blastz.rn3.2005-01-26
    # Use human-chicken params: set L=10000 (higher threshold on blastz's 
    # outer loop) and abridge repeats.
    # DEF: parameter file sourced by the blastz cluster-run scripts.
    # Fix: the SEQ2 stanza was mislabeled "# QUERY: Chicken" although it
    # configures the rat (rn3) query paths -- corrected to "# QUERY: Rat".
    cat << '_EOF_' > DEF
# chicken vs. rat
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/bin/penn:/cluster/bin/i386:/cluster/home/angie/schwartzbin

ALIGN=blastz-run
BLASTZ=blastz

# Specific settings for chicken (per Webb email to Brian Raney)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=10000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q
BLASTZ_ABRIDGE_REPEATS=1

# TARGET: Chicken
SEQ1_DIR=/iscratch/i/galGal2/nib
SEQ1_SMSK=/iscratch/i/galGal2/linSpecRep
SEQ1_IN_CONTIGS=0
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY: Rat
SEQ2_DIR=/iscratch/i/rn3/bothMaskedNibs
SEQ2_SMSK=/iscratch/i/rn3/linSpecRep.notInChicken
SEQ2_IN_CONTIGS=0
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/data/galGal2/bed/blastz.rn3.2005-01-26

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len
'_EOF_'
    # << this line keeps emacs coloring happy

    # Partition chroms into 10MB (+10k overlap) chunks and generate parasol 
    # job list.
    cp /cluster/data/galGal2/chrom.sizes S1.len
    cp /cluster/data/rn3/chrom.sizes S2.len
    mkdir run.blastz
    cd run.blastz
    partitionSequence.pl 10000000 10000 /iscratch/i/galGal2/nib ../S1.len \
      -xdir xdir.sh -rawDir ../lav > galGal2.lst
    partitionSequence.pl 10000000 0 /iscratch/i/rn3/bothMaskedNibs ../S2.len \
      > rn3.lst
    # xdir.sh creates the per-target output directories under ../lav:
    csh -ef xdir.sh
    cat << '_EOF_' > gsub
#LOOP
/cluster/home/angie/kent/src/utils/blastz-run-ucsc $(path1) $(path2) ../DEF {check out line ../lav/$(file1)/$(file1)_$(file2).out}
#ENDLOOP
'_EOF_'
    # << this line keeps emacs coloring happy
    gensub2 galGal2.lst rn3.lst gsub jobList
    para create jobList
    para try, check, push, check, ...


# HUMAN RECIPROCAL-BEST NET FOR STRINGENT LIFTOVER (DONE 2/3/05 angie)
    # Reciprocal-best: net in both directions so each region of either
    # genome is covered by at most one chain, for stringent 1:1 liftOver.
    ssh kolossus
    cd /cluster/data/galGal2/bed/blastz.hg17/axtChain
    # Run chainNet again, this time keeping both of its outputs:
    chainPreNet all.chain ../S1.len ../S2.len stdout \
    | chainNet stdin ../S1.len ../S2.len g_h.net h_g.net
    # Get the human chains from the human-referenced (but chicken-centric) net:
    chainSwap all.chain h_g.chain
    netChainSubset h_g.net h_g.chain stdout \
    | chainSort stdin h_g.subset.chain
    # Net those (sorted) human chains, and keep both outputs, to get 
    # reciprocal best nets referenced to both species:
    chainPreNet h_g.subset.chain ../S2.len ../S1.len stdout \
    | chainNet stdin ../S2.len ../S1.len h_g.rbest.net g_h.rbest.net
    # Get the chains from the recip-best nets for stringent liftOver:
    netChainSubset h_g.rbest.net h_g.chain hg17ToGalGal2.rbest.over.chain
    netChainSubset g_h.rbest.net all.chain galGal2ToHg17.rbest.over.chain


# BLASTZ/CHAIN/NET TETRAODON (DONE 4/25/05 angie)
    # This run uses the automated doBlastzChainNet.pl pipeline, so the DEF
    # file is much smaller than the hand-run blastz DEFs above.
    mkdir /cluster/data/galGal2/bed/blastz.tetNig1.2005-04-25
    cd /cluster/data/galGal2/bed/blastz.tetNig1.2005-04-25
    cat << '_EOF_' > DEF
# Chicken vs. Tetraodon

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - Chicken
SEQ1_DIR=/iscratch/i/galGal2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/data/galGal2/chrom.sizes

# QUERY - Tetraodon
SEQ2_DIR=/iscratch/i/tetNig1/tetNig1.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/tetNig1/chrom.sizes

BASE=/cluster/data/galGal2/bed/blastz.tetNig1.2005-04-25
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /santest/scratch/galGal2tetNig1 >& do.log &
    tail -f do.log
    rmdir /santest/scratch/galGal2tetNig1
    ln -s blastz.tetNig1.2005-04-25 /cluster/data/galGal2/bed/blastz.tetNig1
    # Add {chain,net}TetNig1 to trackDb.ra if necessary.
    featureBits galGal2 chainTetNig1Link
#45071616 bases of 1054197620 (4.275%) in intersection


# BLASTZ/CHAIN/NET ZEBRAFISH (DONE 4/27/05 angie)
    ssh hgwdev
    # fileserver woes -- put the build dir on bluearc, then copy back 
    # to /cluster/store-land when things are better.  
    mkdir -p /cluster/bluearc/angie/galGal2/blastz.danRer2.2005-04-26
    ln -s /cluster/bluearc/angie/galGal2/blastz.danRer2.2005-04-26 \
      /cluster/data/galGal2/bed/blastz.danRer2.2005-04-26
    cp /cluster/data/galGal2/chrom.sizes /cluster/bluearc/galGal2/
    cd /cluster/data/galGal2/bed/blastz.danRer2.2005-04-26
    cat << '_EOF_' > DEF
# Chicken vs. Zebrafish

BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET - Chicken
SEQ1_DIR=/iscratch/i/galGal2/nib
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LEN=/cluster/bluearc/galGal2/chrom.sizes

# QUERY - Zebrafish
SEQ2_DIR=/iscratch/i/danRer2/danRer2.2bit
SEQ2_CHUNK=10000000
SEQ2_LAP=10000
SEQ2_LEN=/cluster/data/danRer2/chrom.sizes

BASE=/cluster/bluearc/angie/galGal2/blastz.danRer2.2005-04-26
'_EOF_'
    # << this line keeps emacs coloring happy
    doBlastzChainNet.pl DEF \
      -blastzOutRoot /santest/scratch/galGal2danRer2 >& do.log &
    tail -f do.log
    ln -s blastz.danRer2.2005-04-26 /cluster/data/galGal2/bed/blastz.danRer2
    # Add {chain,net}DanRer2 to trackDb.ra if necessary.
    featureBits galGal2 chainDanRer2Link
#60589750 bases of 1054197620 (5.747%) in intersection
    # Move the run dir from bluearc to /cluster/store.
    ssh kolossus  # login not allowed on kkusr01, current home of galGal2
    # Drop the symlink first so rsync can create a real directory:
    rm /cluster/data/galGal2/bed/blastz.danRer2.2005-04-26
    rsync -av /cluster/bluearc/angie/galGal2/blastz.danRer2.2005-04-26 \
      /cluster/data/galGal2/bed/
    rm -rf /cluster/bluearc/angie/galGal2/blastz.danRer2.2005-04-26


# SET UP OPOSSUM AND FROG ALIGNMENTS FOR MULTIZ (DONE 4/27/05 angie)
    # possum and frog alignments were put in non-std dir locations and 
    # need maf...
    # (csh syntax below: $f:t is the filename tail/basename, :r strips one
    # extension, so $f:t:r:r strips two, e.g. chr1.axt.gz -> chr1.)
    ssh kolossus
    ln -s /cluster/data/xenTro1/bed/zb.galGal2 \
      /cluster/data/galGal2/bed/blastz.xenTro1
    cd /cluster/data/galGal2/bed/blastz.xenTro1
    mkdir mafNet
    foreach f (axtNet/*.axt)
      echo $f:t:r
      axtSort $f stdout \
      | axtToMaf -tPrefix=galGal2. -qPrefix=xenTro1. \
         stdin /cluster/data/{galGal2,xenTro1}/chrom.sizes stdout \
      | gzip -c > mafNet/$f:t:r.maf.gz
    end
    ln -s /cluster/data/monDom1/bed/zb.galGal2 \
      /cluster/data/galGal2/bed/blastz.monDom1
    cd /cluster/data/galGal2/bed/blastz.monDom1
    mkdir mafNet
    foreach f (axtNet/*.axt.gz)
      echo $f:t:r:r
      gunzip -c $f | axtSort stdin stdout \
      | axtToMaf -tPrefix=galGal2. -qPrefix=monDom1. \
          stdin /cluster/data/{galGal2,monDom1}/chrom.sizes stdout \
      | gzip -c > mafNet/$f:t:r:r.maf.gz
    end


# MAKE VSXENTRO1 DOWNLOADABLES (DONE 5/2/05 angie)
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/galGal2/vsXenTro1
    cd /usr/local/apache/htdocs/goldenPath/galGal2/vsXenTro1
    ln -s /cluster/data/galGal2/bed/blastz.xenTro1/axtChain/galGal2.xenTro1.*.gz .
    mkdir axtNet
    foreach f (/cluster/data/galGal2/bed/blastz.xenTro1/axtNet/*.axt.gz)
      ln -s $f axtNet/$f:t:r:r.galGal2.xenTro1.net.axt.gz
    end
    nice md5sum *.gz */*.gz > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.


# MULTIZ CHICKEN/HUMAN/MOUSE/OPOSSUM/FROG/FISHES (DONE 4/27/05 angie)
    # tree: (((galGal2 ((hg17 mm5) monDom1)) xenTro1) (danRer2 tetNig1))
    ssh kolossus
    # Put run dir on /santest/scratch for now, to ease load on fileservers:
    mkdir -p /santest/scratch/galGal2/multiz7way.2005-04-27
    cd /santest/scratch/galGal2/multiz7way.2005-04-27
    # Setup: Copy pairwise MAF to /santest/scratch:
    mkdir /santest/scratch/chickenMultiz7way
    foreach db (hg17 mm5 monDom1 xenTro1 danRer2 tetNig1)
      cp -pR /cluster/data/galGal2/bed/blastz.$db/mafNet \
        /santest/scratch/chickenMultiz7way/$db
    end
    # Most but not all of those dirs contain gzip'd maf -- gzip where 
    # necessary to make them all uniform (& less cluster I/O):
    gzip /santest/scratch/chickenMultiz7way/*/*.maf
    # Make output dir:
    mkdir maf
    # Create script to run multiz.v10 in the right order:
    # doMultiz.csh: per-chrom progressive multiz.  Builds the subtree
    # alignments pairwise (human+mouse, +opossum, +frog, fish pair), and
    # after each multiz.v10 join re-projects onto the galGal2 reference with
    # maf_project.  Each "else if" handles the case where one side of a join
    # has no alignments for this chrom.  The final result is moved/copied to
    # $DEST and the per-chrom temp dir is cleaned up.
    # NOTE(review): multiz.v10's third argument differs between joins
    # (0 for the first-pass pairs, 1 for later joins) -- preserved as-is;
    # confirm its meaning against the multiz documentation before reuse.
    cat << '_EOF_' > doMultiz.csh
#!/bin/csh -fe
set chr = $1
set tmp = /scratch/tmp/chickenMultiz7way.$chr
mkdir $tmp

set REF = galGal2.$chr
set HUM = /santest/scratch/chickenMultiz7way/hg17/$chr.maf.gz
set MUS = /santest/scratch/chickenMultiz7way/mm5/$chr.maf.gz
set MON = /santest/scratch/chickenMultiz7way/monDom1/$chr.maf.gz
set XEN = /santest/scratch/chickenMultiz7way/xenTro1/$chr.maf.gz
set DAN = /santest/scratch/chickenMultiz7way/danRer2/$chr.maf.gz
set TET = /santest/scratch/chickenMultiz7way/tetNig1/$chr.maf.gz
set DEST = /santest/scratch/galGal2/multiz7way.2005-04-27/maf/$chr.maf

set MZ10 = /cluster/bin/penn/multiz.v10
set PROJECT = /cluster/bin/penn/maf_project

# final tree: (((galGal2 ((hg17 mm5) monDom1)) xenTro1) (danRer2 tetNig1))

# subtree: ((galGal2 (hg17 mm5))
if ( -s $HUM && -s $MUS ) then 
    echo "Aligning $HUM $MUS..."
    zcat $HUM > $tmp/$chr.Hum.maf
    zcat $MUS > $tmp/$chr.Mus.maf
    $MZ10 $tmp/$chr.Hum.maf $tmp/$chr.Mus.maf 0 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.HumMus.maf
else if ( -s $HUM ) then
    zcat $HUM > $tmp/$chr.HumMus.maf
else if ( -s $MUS ) then
    zcat $MUS > $tmp/$chr.HumMus.maf
endif

# subtree: (galGal2 ((hg17 mm5) monDom1))
if ( -s $tmp/$chr.HumMus.maf && -s $MON ) then 
    echo "Aligning $tmp/$chr.HumMus.maf $MON..."
    zcat $MON > $tmp/$chr.Mon.maf
    $MZ10 $tmp/$chr.HumMus.maf $tmp/$chr.Mon.maf 0 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.HumMusMon.maf
else if ( -s $tmp/$chr.HumMus.maf ) then
    cp $tmp/$chr.HumMus.maf $tmp/$chr.HumMusMon.maf
else if ( -s $MON ) then
    zcat $MON > $tmp/$chr.HumMusMon.maf
endif

# subtree: ((galGal2 ((hg17 mm5) monDom1)) xenTro1)
if ( -s $tmp/$chr.HumMusMon.maf && -s $XEN ) then 
    echo "Aligning $tmp/$chr.HumMusMon.maf $XEN..."
    zcat $XEN > $tmp/$chr.Xen.maf
    $MZ10 $tmp/$chr.HumMusMon.maf $tmp/$chr.Xen.maf 1 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.HumMusMonXen.maf
else if ( -s $tmp/$chr.HumMusMon.maf ) then
    cp $tmp/$chr.HumMusMon.maf $tmp/$chr.HumMusMonXen.maf
else if ( -s $XEN ) then
    zcat $XEN > $tmp/$chr.HumMusMonXen.maf
endif

# subtree: (galGal2 (danRer2 tetNig1))
if ( -s $DAN && -s $TET ) then 
    echo "Aligning $DAN $TET..."
    zcat $DAN > $tmp/$chr.Dan.maf
    zcat $TET > $tmp/$chr.Tet.maf
    $MZ10 $tmp/$chr.Dan.maf $tmp/$chr.Tet.maf 0 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.DanTet.maf
else if ( -s $DAN ) then
    zcat $DAN > $tmp/$chr.DanTet.maf
else if ( -s $TET ) then
    zcat $TET > $tmp/$chr.DanTet.maf
endif


# final tree: (((galGal2 ((hg17 mm5) monDom1)) xenTro1) (danRer2 tetNig1))
if ( -s $tmp/$chr.HumMusMonXen.maf && -s $tmp/$chr.DanTet.maf ) then
    echo "Aligning $tmp/$chr.HumMusMonXen.maf $tmp/$chr.DanTet.maf..."
    $MZ10 $tmp/$chr.HumMusMonXen.maf $tmp/$chr.DanTet.maf 1 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.HumMusMonXenDanTet.maf
    mv $tmp/$chr.HumMusMonXenDanTet.maf $DEST
else if ( -s $tmp/$chr.DanTet.maf ) then
    cp $tmp/$chr.DanTet.maf $DEST
else if ( -s $tmp/$chr.HumMusMonXen.maf ) then
    cp $tmp/$chr.HumMusMonXen.maf $DEST
endif 

rm $tmp/$chr.*.maf
rmdir $tmp
'_EOF_'
    # << keep emacs coloring happy    
    chmod 755 doMultiz.csh
    # One job per chromosome:
    awk '{print "./doMultiz.csh " $1;}' /cluster/data/galGal2/chrom.sizes \
      > jobs.lst
    # Run on small cluster
    ssh kki
    cd /santest/scratch/galGal2/multiz7way.2005-04-27
    para create jobs.lst
    para try, check, push, check, ...
#Completed: 53 of 54 jobs
#Crashed: 1 jobs
#Average job time:                 164s       2.73m     0.05h    0.00d
#Longest finished job:            1268s      21.13m     0.35h    0.01d
#Submission to last job:          1456s      24.27m     0.40h    0.02d
    # The failure was for chr8_random which had no alignments in any 
    # of the species.

    # Copy back over to /cluster/store-land
    rsync -av /santest/scratch/galGal2/multiz7way.2005-04-27 \
      /cluster/data/galGal2/bed
    ln -s /cluster/data/galGal2/bed/multiz7way.2005-04-27 \
      /cluster/data/galGal2/bed/multiz7way

    # make /gbdb/ links to 7way maf files:
    ssh hgwdev
    mkdir -p /gbdb/galGal2/multiz7way/maf/multiz7way
    ln -s /cluster/data/galGal2/bed/multiz7way/maf/chr*.maf \
      /gbdb/galGal2/multiz7way/maf/multiz7way/

    # load into database
    cd /tmp
    hgLoadMaf -warn galGal2 multiz7way \
      -pathPrefix=/gbdb/galGal2/multiz7way/maf/multiz7way

    # put 7way MAF out for download
    ssh kolossus
    cd /cluster/data/galGal2/bed/multiz7way
    mkdir mafDownload
    foreach f (maf/*.maf)
      nice gzip -c $f > mafDownload/$f:t.gz
    end
    cd mafDownload
    nice md5sum *.maf.gz > md5sum.txt
    # make a README.txt there too.
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/galGal2/multiz7way
    ln -s /cluster/data/galGal2/bed/multiz7way/mafDownload/{*.maf.gz,md5sum.txt,README.txt} \
      /usr/local/apache/htdocs/goldenPath/galGal2/multiz7way

    # Cleanup
    rm -rf /santest/scratch/galGal2/multiz7way.2005-04-27
    rm -rf /santest/scratch/chickenMultiz7way/


# PHASTCONS 7WAY WITH METHODS FROM PAPER (DONE 4/28/05 angie)
# NOTE FROM QA, 5/12/08: the phastConsElements7way track has an overlap-by-one
# error at chr1:92,640,085-92,640,105 . This was found when investigating a
# similar bug reported by user Michael Hiller in dm3 phastCons data.  As
# galGal2 has been superseded by galGal3, this track is going to be left as-is.

    # Using the param estimation methods described in Adam's 
    # Genome Research paper...  but what to use for CDS coverage metric???
    # tree: (((galGal2,((hg17,mm5),monDom1)),xenTro1),(danRer2,tetNig1))
    ssh kolossus
    mkdir -p /santest/scratch/galGal2/chrom
    cp -p /cluster/data/galGal2/*/chr*.fa /santest/scratch/galGal2/chrom/
    mkdir /santest/scratch/galGal2/maf7way
    cp -p /cluster/data/galGal2/bed/multiz7way/maf/*.maf \
      /santest/scratch/galGal2/maf7way
    # Split chrom fa & maf into smaller windows for phastCons:
    ssh kki
    mkdir /cluster/data/galGal2/bed/multiz7way/phastCons
    mkdir /cluster/data/galGal2/bed/multiz7way/phastCons/run.split
    cd /cluster/data/galGal2/bed/multiz7way/phastCons/run.split
    set WINDOWS = /santest/scratch/galGal2/phastCons/WINDOWS
    rm -fr $WINDOWS
    mkdir -p $WINDOWS
    # doSplit.sh: split one chrom maf into ~1Mb SS-format windows with
    # msa_split, gzip each window into $WINDOWS.
    # NOTE(review): despite the .sh name, the shebang is csh -- it is only
    # ever invoked via its shebang (chmod a+x, run by parasol), so this works.
    cat << 'EOF' > doSplit.sh
#!/bin/csh -ef

set PHAST=/cluster/bin/phast
set FA_SRC=/santest/scratch/galGal2/chrom
set WINDOWS=/santest/scratch/galGal2/phastCons/WINDOWS

set maf=$1
set c = $maf:t:r:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \
   -O galGal2,hg17,mm5,monDom1,xenTro1,danRer2,tetNig1 \
   -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
foreach file ($c.*.ss)
  gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
    chmod a+x doSplit.sh
    rm -f jobList
    foreach file (/santest/scratch/galGal2/maf7way/*.maf) 
      if (-s $file) then
        echo "doSplit.sh {check in exists+ $file}" >> jobList
      endif
    end
    para create jobList
    para try,  check, push, check...
#Completed: 52 of 53 jobs
#Crashed: 1 jobs
#Average job time:                  32s       0.53m     0.01h    0.00d
#Longest finished job:             204s       3.40m     0.06h    0.00d
#Submission to last job:           506s       8.43m     0.14h    0.01d
    # 1 failure because there wasn't enough info in chr6_random (just one 
    # little alignment to monDom1 only), OK.
#WARNING: skipping partition 1; insufficient informative sites.

    # Get genome-wide all-species average GC content 
    # by running msa_view --aggregate --summary-only on the WINDOWS .SS files.
    # msa_view needs seekable single-ss file, so unpack each to /tmp and 
    # compute overall average:
    ssh kolossus
    cd /cluster/data/galGal2/bed/multiz7way/phastCons/
    cp /dev/null gcs.txt
    # Do this on just a random sample of windows:
    foreach f (`ls -1 /santest/scratch/galGal2/phastCons/WINDOWS/* \
                | randomLines stdin 200 stdout`)
      echo $f:t:r:r
      zcat $f > /tmp/galGal2.sample.ss
      # Sum the G and C base frequency columns from the summary line:
      /cluster/bin/phast/msa_view \
        --aggregate galGal2,hg17,mm5,monDom1,xenTro1,danRer2,tetNig1 \
        -i SS --summary-only /tmp/galGal2.sample.ss \
      | awk '$1 == "[aggregate]" {printf "%0.4f\n", $3 + $4}' \
      >> gcs.txt
    end
    ave gcs.txt
#average 0.418876
    # OK, so use 0.419 as the --gc parameter below.
    rm /tmp/galGal2.sample.ss

    ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY #############
    # Use phyloFit to make an initial model:
    ssh kolossus
    cd /cluster/data/galGal2/bed/multiz7way/phastCons
    /cluster/bin/phast/msa_view -i MAF \
      /santest/scratch/galGal2/maf7way/chr{1,Z_random,32}.maf \
        --aggregate galGal2,hg17,mm5,monDom1,xenTro1,danRer2,tetNig1 \
        > /santest/scratch/galGal2/phastCons/all.ss
    /cluster/bin/phast/phyloFit /santest/scratch/galGal2/phastCons/all.ss \
      --tree "(((galGal2,((hg17,mm5),monDom1)),xenTro1),(danRer2,tetNig1))" \
      -i SS --out-root starting-tree
    cat starting-tree.mod 
#TREE: (((galGal2:0.219806,((hg17:0.090064,mm5:0.137589):0.090370,monDom1:0.171369):0.112129):0.062872,xenTro1:0.267337):0.086182,(danRer2:0.219771,tetNig1:0.184348):0.086182);

    # Use consEntropy --NH to make it suggest a --expected-lengths param 
    # that we should try next.  Adam ran this on hg17 to find out the 
    # total entropy of the hg17 model:
    # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod
#Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045
#Relative entropy: H=0.608216 bits/site
#Required length: N=16.085437 sites
#Total entropy: NH=9.783421 bits
    # Our target is that NH result: 9.7834 bits.  
    # phyloFit makes one .mod but consEntropy wants two...
    # Multiply each subst rate on the TREE line by 3.2 which is roughly the 
    # ratio of noncons to cons in 
    # /cluster/data/hg17/bed/multiz8way/cons/run.elements/ave.*.mod
    /cluster/bin/phast/tree_doctor -s 3.2 starting-tree.mod \
      > starting-tree.noncons.mod
    # OK, so Adam's initial consEntropy run used 0.265 as the target-cov,
    # but the latest 8way ended up with 0.17.  Hmmm.  I'm going to find 
    # a target-cov that keeps expected-length 12, just for the hell of it,
    # as a starting point.
    /cluster/bin/phast/consEntropy --NH 9.7834 0.189 12 \
      starting-tree*.mod
#( Solving for new omega: 12.000000 12.038661 12.038583 )
#Transition parameters: gamma=0.189000, omega=12.000000, mu=0.083333, nu=0.019420
#Relative entropy: H=1.444808 bits/site
#Required length: N=6.765816 sites
#Total entropy: NH=9.775307 bits
#Recommended expected length: omega=12.038583 sites (for NH=9.783400)

    ############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ###########
    # We're here because the actual target coverage was not satisfactory,
    # so we're changing the --target-coverage param.  Given that we're 
    # changing that, take a guess at how we should change --expected-lengths
    # in order to also hit the total entropy target.
    cd /cluster/data/galGal2/bed/multiz7way/phastCons/run.estimate
    # SECOND ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.22 12 ave.{cons,noncons}.mod
#Recommended expected length: omega=12.334751 sites (for NH=9.783400)
    # OK, --expected-lengths 12.3

    # Now set up cluster job to estimate model parameters given free params 
    # --target-coverage and --expected-lengths and the data.  
    ssh kk9
    mkdir /cluster/data/galGal2/bed/multiz7way/phastCons/run.estimate
    cd /cluster/data/galGal2/bed/multiz7way/phastCons/run.estimate
    # Two versions of doEstimate.sh follow; the second here-doc overwrites
    # the first, so use whichever matches the current iteration.
    # FIRST ITERATION: Use ../starting-tree.noncons.mod:
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.noncons.mod --gc 0.419 --nrates 1,1 \
    --no-post-probs --ignore-missing \
    --expected-lengths 12 --target-coverage 0.189 \
    --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model.
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
zcat $1 \
| /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.419 --nrates 1,1 \
    --no-post-probs --ignore-missing \
    --expected-lengths 12.3 --target-coverage 0.22 \
    --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    chmod a+x doEstimate.sh
    rm -fr LOG TREES
    mkdir -p LOG TREES
    rm -f jobList
    # One estimation job per SS window:
    foreach f (/santest/scratch/galGal2/phastCons/WINDOWS/*.ss.gz)
      set root = $f:t:r:r
      echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
    end
    # run cluster job
    para create jobList
    para try, check, push, check, ...
#Completed: 1161 of 1161 jobs
#Average job time:                 357s       5.95m     0.10h    0.00d
#Longest finished job:            1435s      23.92m     0.40h    0.02d
#Submission to last job:          1440s      24.00m     0.40h    0.02d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh kolossus
    cd /cluster/data/galGal2/bed/multiz7way/phastCons/run.estimate
    ls -1 TREES/*.cons.mod > cons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
      --output-average ave.cons.mod > cons_summary.txt
    ls -1 TREES/*.noncons.mod > noncons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
      --output-average ave.noncons.mod > noncons_summary.txt
    grep TREE ave*.mod
    # FIRST ITERATION:
#ave.cons.mod:TREE: (((galGal2:0.075819,((hg17:0.038800,mm5:0.063133):0.039437,monDom1:0.074646):0.048105):0.022112,xenTro1:0.134733):0.038043,(danRer2:0.102215,tetNig1:0.105871):0.038043);
#ave.noncons.mod:TREE: (((galGal2:0.249812,((hg17:0.127082,mm5:0.208431):0.130021,monDom1:0.244194):0.158600):0.072560,xenTro1:0.447059):0.127473,(danRer2:0.339178,tetNig1:0.351891):0.127473);
    # SECOND ITERATION:
#ave.cons.mod:TREE: (((galGal2:0.076928,((hg17:0.038920,mm5:0.063282):0.039737,monDom1:0.074819):0.048432):0.022500,xenTro1:0.135616):0.038629,(danRer2:0.103217,tetNig1:0.106871):0.038629);
#ave.noncons.mod:TREE: (((galGal2:0.254336,((hg17:0.128177,mm5:0.210032):0.131589,monDom1:0.246276):0.160281):0.074147,xenTro1:0.451898):0.129916,(danRer2:0.344050,tetNig1:0.356388):0.129916);

    cat cons_summary.txt 
    # look over the files cons_summary.txt and noncons_summary.txt.
    # The means and medians should be roughly equal and the stdevs
    # should be reasonably small compared to the means, particularly
    # for rate matrix parameters (at bottom) and for branches to the
    # leaves of the tree.  The stdevs may be fairly high for branches
    # near the root of the tree; that's okay.  Some min values may be
    # 0 for some parameters.  That's okay, but watch out for very large
    # values in the max column, which might skew the mean.  If you see
    # any signs of bad outliers, you may have to track down the
    # responsible .mod files and throw them out.  I've never had to do
    # this; the estimates generally seem pretty well behaved.

    # NOTE: Actually, a random sample of several hundred to a thousand
    # alignment fragments (say, a number equal to the number of
    # available cluster nodes) should be more than adequate for
    # parameter estimation.  If pressed for time, use this strategy.

    # Check the total entropy figure to see if we're way off.
    # We probably don't need to do this, since it has always been very close 
    # if not the same as what we used above, but it only takes a second.
    # FIRST ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.189 12 ave.{cons,noncons}.mod
#Recommended expected length: omega=10.404849 sites (for NH=9.783400)
    # OK, tweak --expected-lengths to 10.4 below (and for next iteration).
    # SECOND ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.22 12.3 ave.{cons,noncons}.mod
#Recommended expected length: omega=12.405521 sites (for NH=9.783400)
    # OK, tweak --expected-lengths to 12.4 below (and for next iteration).

    # Now we are ready to set up the cluster job for computing the
    # conservation scores and predicted elements.  Then we measure the 
    # conserved elements coverage, and if that's not satisfactory then we 
    # adjust parameters and repeat.  
    ssh kk9
    mkdir /cluster/data/galGal2/bed/multiz7way/phastCons/run.phast
    cd /cluster/data/galGal2/bed/multiz7way/phastCons/run.phast
    # doPhastCons.sh: score one SS window with the averaged cons/noncons
    # models, writing conserved elements (--viterbi bed) and per-base
    # posterior probabilities (gzipped) to scratch.
    cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
set pref = $1:t:r:r
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
    ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
    --expected-lengths 12.4 --target-coverage 0.22 \
    --quiet --seqname $chr --idpref $pref \
    --viterbi /santest/scratch/galGal2/phastCons/ELEMENTS/$pref.bed --score \
    --require-informative 0 \
  > $tmpfile
gzip -c $tmpfile > /santest/scratch/galGal2/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
    chmod a+x doPhastCons.sh
    rm -fr /santest/scratch/galGal2/phastCons/{POSTPROBS,ELEMENTS}
    mkdir -p /santest/scratch/galGal2/phastCons/{POSTPROBS,ELEMENTS}
    rm -f jobList
    foreach f (/santest/scratch/galGal2/phastCons/WINDOWS/*.ss.gz)
      echo doPhastCons.sh $f >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 1161 of 1161 jobs
#Average job time:                  15s       0.24m     0.00h    0.00d
#Longest finished job:              20s       0.33m     0.01h    0.00d
#Submission to last job:            36s       0.60m     0.01h    0.00d

    # back on kolossus:
    # combine predictions and transform scores to be in 0-1000 interval
    cd /cluster/data/galGal2/bed/multiz7way/phastCons
    awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
      /santest/scratch/galGal2/phastCons/ELEMENTS/*.bed \
    | /cluster/bin/scripts/lodToBedScore > all.bed

    ssh hgwdev
    # Now measure coverage of CDS by conserved elements. 
    # We want the "cover" figure to be close to 68.9%.
    # But what to use as CDS?  There's a refGene but it's terribly incomplete.
    # There's a human proteins track, but that's only what human and chicken 
    # have in common.  There's a xenoRefGene too... again comparative-
    # influenced, but a lot more complete than refGene presumably.
    # So just go with first cut results for now?
    cd /cluster/data/galGal2/bed/multiz7way/phastCons
    featureBits -enrichment galGal2 refGene:cds all.bed
    featureBits -enrichment galGal2 xenoRefGene:cds all.bed
    featureBits -enrichment galGal2 blastHg17KG all.bed
    # FIRST ITERATION: pretty close!  Let's try to bump the coverage up a bit.
#refGene:cds 0.340%, all.bed 2.831%, both 0.207%, cover 60.76%, enrich 21.46x
#xenoRefGene:cds 1.644%, all.bed 2.831%, both 1.114%, cover 67.76%, enrich 23.93x
#blastHg17KG 2.001%, all.bed 2.831%, both 1.109%, cover 55.44%, enrich 19.58x
    # SECOND ITERATION: was hoping for a bit more, but this will do.
#refGene:cds 0.340%, all.bed 3.008%, both 0.212%, cover 62.27%, enrich 20.70x
#xenoRefGene:cds 1.644%, all.bed 3.008%, both 1.144%, cover 69.55%, enrich 23.12x
#blastHg17KG 2.001%, all.bed 3.008%, both 1.143%, cover 57.14%, enrich 19.00x

    # Having met the CDS coverage target, load up the results.
    hgLoadBed galGal2 phastConsElements7way all.bed

    # Create wiggle on the small cluster
    ssh kki
    mkdir /cluster/data/galGal2/bed/multiz7way/phastCons/run.wib
    cd /cluster/data/galGal2/bed/multiz7way/phastCons/run.wib
    rm -rf /santest/scratch/galGal2/phastCons/wib
    mkdir -p /santest/scratch/galGal2/phastCons/wib
    cat << 'EOF' > doWigEncode
#!/bin/csh -ef
# doWigEncode: build one chromosome's phastCons wiggle from the per-window
# posterior-probability files.
# Argument: $1 = chromosome name (e.g. chr1).
set chr = $1
cd /santest/scratch/galGal2/phastCons/wib
# Concatenate this chrom's per-window .pp.gz files in genomic window order
# (numeric sort on the field after the first "."), then encode the stream
# into ${chr}_phastCons.wig (index) and ${chr}_phastCons.wib (binary data).
zcat `ls -1 /santest/scratch/galGal2/phastCons/POSTPROBS/$chr.*.pp.gz \
      | sort -t\. -k2,2n` \
| wigEncode stdin ${chr}_phastCons.wi{g,b}
'EOF'
# << for emacs
    chmod a+x doWigEncode
    rm -f jobList
    foreach chr (`ls -1 /santest/scratch/galGal2/phastCons/POSTPROBS \
                  | awk -F\. '{print $1}' | sort -u`)
      echo doWigEncode $chr >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 52 of 52 jobs
#Average job time:                   5s       0.09m     0.00h    0.00d
#Longest finished job:              27s       0.45m     0.01h    0.00d
#Submission to last job:            30s       0.50m     0.01h    0.00d

    # back on kolossus, copy wibs, wigs and POSTPROBS (people sometimes want 
    # the raw scores) from bluearc
    cd /cluster/data/galGal2/bed/multiz7way/phastCons
    rm -rf wib POSTPROBS
    rsync -av /santest/scratch/galGal2/phastCons/wib .
    rsync -av /santest/scratch/galGal2/phastCons/POSTPROBS .

    # load wiggle component of Conservation track
    ssh hgwdev
    mkdir /gbdb/galGal2/multiz7way/wib
    cd /cluster/data/galGal2/bed/multiz7way/phastCons
    chmod 775 . wib
    chmod 664 wib/*.wib
    ln -s `pwd`/wib/*.wib /gbdb/galGal2/multiz7way/wib/
    hgLoadWiggle galGal2 phastCons7way \
      -pathPrefix=/gbdb/galGal2/multiz7way/wib wib/*.wig
    rm wiggle.tab

    # make top-5000 list and launcher on Adam's home page:
    sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \
    > top5000.bed
    # Don't have group write perms to /cse/grads/acs/ so can't do this anymore:
    /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
      /cse/grads/acs/public_html/gg-top5000-7way \
      "top 5000 conserved elements (7way)" galGal2

    # and clean up bluearc.
    rm -r /santest/scratch/galGal2/phastCons/{ELEMENTS,POSTPROBS,wib}
    rm -r /santest/scratch/galGal2/chrom


# MULTIZ CHICKEN/HUMAN/OPOSSUM/ZEBRAFISH (DONE 4/28/05 angie)
    # Backup plan in case QA doesn't have time to release the 7way and
    # all the pairwise tracks it uses; this uses just these nicely spaced critters:
    # tree: ((galGal2 (hg17 monDom1)) danRer2)
    ssh kolossus
    # Put run dir on /santest/scratch for now, to ease load on fileservers:
    mkdir -p /santest/scratch/galGal2/multiz4way.2005-04-28
    cd /santest/scratch/galGal2/multiz4way.2005-04-28
    # Setup: Copy pairwise MAF to /santest/scratch:
    mkdir /santest/scratch/chickenMultiz4way
    foreach db (hg17 monDom1 danRer2)
      cp -pR /cluster/data/galGal2/bed/blastz.$db/mafNet \
        /santest/scratch/chickenMultiz4way/$db
    end
    # Most but not all of those dirs contain gzip'd maf -- gzip where 
    # necessary to make them all uniform (& less cluster I/O):
    gzip /santest/scratch/chickenMultiz4way/*/*.maf
    # Make output dir:
    mkdir maf
    # Create script to run multiz.v10 in the right order:
    cat << '_EOF_' > doMultiz.csh
#!/bin/csh -fe
# doMultiz.csh: build the 4-way multiple alignment for one chromosome by
# progressively combining the pairwise MAFs with multiz.v10, projecting onto
# the chicken reference after each combination step.
# Argument: $1 = chromosome name (e.g. chr1).
set chr = $1
set tmp = /scratch/tmp/chickenMultiz4way.$chr
mkdir $tmp

# Inputs: reference sequence name and the three pairwise MAFs for this chrom.
set REF = galGal2.$chr
set HUM = /santest/scratch/chickenMultiz4way/hg17/$chr.maf.gz
set MON = /santest/scratch/chickenMultiz4way/monDom1/$chr.maf.gz
set DAN = /santest/scratch/chickenMultiz4way/danRer2/$chr.maf.gz
set DEST = /santest/scratch/galGal2/multiz4way.2005-04-28/maf/$chr.maf

set MZ10 = /cluster/bin/penn/multiz.v10
set PROJECT = /cluster/bin/penn/maf_project

# final tree: ((galGal2 (hg17 monDom1)) danRer2)

# subtree: (galGal2 (hg17 monDom1))
# Combine human and opossum first; if only one has alignments for this
# chrom, just use that one as the "HumMon" intermediate.
if ( -s $HUM && -s $MON ) then 
    echo "Aligning $HUM $MON..."
    zcat $HUM > $tmp/$chr.Hum.maf
    zcat $MON > $tmp/$chr.Mon.maf
    $MZ10 $tmp/$chr.Hum.maf $tmp/$chr.Mon.maf 0 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.HumMon.maf
else if ( -s $HUM ) then
    zcat $HUM > $tmp/$chr.HumMon.maf
else if ( -s $MON ) then
    zcat $MON > $tmp/$chr.HumMon.maf
endif

# final tree: ((galGal2 (hg17 monDom1)) danRer2)
# Now fold in zebrafish; again fall back to whichever input is non-empty.
if ( -s $tmp/$chr.HumMon.maf && -s $DAN ) then
    echo "Aligning $tmp/$chr.HumMon.maf $DAN..."
    zcat $DAN > $tmp/$chr.Dan.maf
    $MZ10 $tmp/$chr.HumMon.maf $tmp/$chr.Dan.maf 1 > $tmp/$chr.tmp.maf
    echo "Projecting on $REF..."
    $PROJECT $tmp/$chr.tmp.maf $REF > $tmp/$chr.HumMonDan.maf
    mv $tmp/$chr.HumMonDan.maf $DEST
else if ( -s $DAN ) then
    zcat $DAN > $DEST
    # stub file so the cleanup "rm $tmp/$chr.*.maf" below has something
    # to match even when no intermediates were made:
    touch $tmp/$chr.stub.maf
else if ( -s $tmp/$chr.HumMon.maf ) then
    cp $tmp/$chr.HumMon.maf $DEST
endif 

rm $tmp/$chr.*.maf
rmdir $tmp
'_EOF_'
    # << keep emacs coloring happy    
    chmod 755 doMultiz.csh
    awk '{print "./doMultiz.csh " $1;}' /cluster/data/galGal2/chrom.sizes \
      > jobs.lst
    # chr6_random and chr8_random always cause problems due to lack of 
    # alignment coverage -- edit them out of the list now.  
    # Run on small cluster
    ssh kki
    cd /santest/scratch/galGal2/multiz4way.2005-04-28
    para create jobs.lst
    para try, check, push, check, ...
#Completed: 52 of 52 jobs
#Average job time:                  55s       0.92m     0.02h    0.00d
#Longest finished job:             384s       6.40m     0.11h    0.00d
#Submission to last job:           417s       6.95m     0.12h    0.00d

    # Copy back over to /cluster/store-land
    rsync -av /santest/scratch/galGal2/multiz4way.2005-04-28 \
      /cluster/data/galGal2/bed
    ln -s /cluster/data/galGal2/bed/multiz4way.2005-04-28 \
      /cluster/data/galGal2/bed/multiz4way

    # make /gbdb/ links to 4way maf files:
    ssh hgwdev
    mkdir -p /gbdb/galGal2/multiz4way/maf/multiz4way
    ln -s /cluster/data/galGal2/bed/multiz4way/maf/chr*.maf \
      /gbdb/galGal2/multiz4way/maf/multiz4way/

    # load into database
    cd /tmp
    hgLoadMaf -warn galGal2 multiz4way \
      -pathPrefix=/gbdb/galGal2/multiz4way/maf/multiz4way

    # put 4way MAF out for download
    ssh kolossus
    cd /cluster/data/galGal2/bed/multiz4way
    mkdir mafDownload
    foreach f (maf/*.maf)
      nice gzip -c $f > mafDownload/$f:t.gz
    end
    cd mafDownload
    nice md5sum *.maf.gz > md5sum.txt
    # make a README.txt there too.
    ssh hgwdev
    mkdir /usr/local/apache/htdocs/goldenPath/galGal2/multiz4way
    ln -s /cluster/data/galGal2/bed/multiz4way/mafDownload/{*.maf.gz,md5sum.txt,README.txt} \
      /usr/local/apache/htdocs/goldenPath/galGal2/multiz4way

    # Cleanup
    rm -rf /santest/scratch/galGal2/multiz4way.2005-04-28
    rm -rf /santest/scratch/chickenMultiz4way/


# PHASTCONS 4WAY WITH METHODS FROM PAPER (DONE 4/28/05 angie)
    # Backup plan in case QA doesn't have time to release the 7way and
    # all the pairwise tracks it uses; this uses just these nicely spaced critters:
    # ((galGal2 (hg17 monDom1)) danRer2)
    ssh kolossus
    mkdir -p /santest/scratch/galGal2/chrom
    cp -p /cluster/data/galGal2/*/chr*.fa /santest/scratch/galGal2/chrom/
    mkdir /santest/scratch/galGal2/maf4way
    cp -p /cluster/data/galGal2/bed/multiz4way/maf/*.maf \
      /santest/scratch/galGal2/maf4way
    # Split chrom fa & maf into smaller windows for phastCons:
    ssh kki
    mkdir /cluster/data/galGal2/bed/multiz4way/phastCons
    mkdir /cluster/data/galGal2/bed/multiz4way/phastCons/run.split
    cd /cluster/data/galGal2/bed/multiz4way/phastCons/run.split
    set WINDOWS = /santest/scratch/galGal2/phastCons/WINDOWS
    rm -fr $WINDOWS
    mkdir -p $WINDOWS
    cat << 'EOF' > doSplit.sh
#!/bin/csh -ef
# doSplit.sh: split one chromosome's 4-way MAF into ~1Mb SS-format windows
# for phastCons, using the chrom fasta to fill in reference coordinates.
# Argument: $1 = path to a per-chromosome .maf file.

set PHAST=/cluster/bin/phast
set FA_SRC=/santest/scratch/galGal2/chrom
set WINDOWS=/santest/scratch/galGal2/phastCons/WINDOWS

set maf=$1
# $maf:t:r:r strips the directory and two extensions to get the chrom name.
set c = $maf:t:r:r
set tmpDir = /scratch/msa_split/$c
rm -rf $tmpDir
mkdir -p $tmpDir
# 1Mb windows with 0 overlap; -B 5000 / -I 1000 tune split-point selection.
${PHAST}/msa_split $1 -i MAF -M ${FA_SRC}/$c.fa \
   -O galGal2,hg17,monDom1,danRer2 \
   -w 1000000,0 -r $tmpDir/$c -o SS -I 1000 -B 5000
cd $tmpDir
# Compress each window into the shared WINDOWS dir, then clean up locally.
foreach file ($c.*.ss)
  gzip -c $file > ${WINDOWS}/$file.gz
end
rm -f $tmpDir/$c.*.ss
rmdir $tmpDir
'EOF'
# << for emacs
    chmod a+x doSplit.sh
    rm -f jobList
    foreach file (/santest/scratch/galGal2/maf4way/*.maf) 
      if (-s $file) then
        echo "doSplit.sh {check in exists+ $file}" >> jobList
      endif
    end
    para create jobList
    para try,  check, push, check...
#Completed: 51 of 52 jobs
#Crashed: 1 jobs
#Average job time:                  28s       0.47m     0.01h    0.00d
#Longest finished job:             266s       4.43m     0.07h    0.00d
#Submission to last job:           266s       4.43m     0.07h    0.00d
    # 1 failure because there wasn't enough info in chrE64, oh well:
#WARNING: skipping partition 1; insufficient informative sites.

    # Get genome-wide all-species average GC content 
    # by running msa_view --aggregate --summary-only on the WINDOWS .SS files.
    # msa_view needs seekable single-ss file, so unpack each to /tmp and 
    # compute overall average:
    ssh kolossus
    cd /cluster/data/galGal2/bed/multiz4way/phastCons/
    cp /dev/null gcs.txt
    # Do this on just a random sample of windows:
    foreach f (`ls -1 /santest/scratch/galGal2/phastCons/WINDOWS/* \
                | randomLines stdin 200 stdout`)
      echo $f:t:r:r
      zcat $f > /tmp/galGal2.sample.ss
      /cluster/bin/phast/msa_view \
        --aggregate galGal2,hg17,monDom1,danRer2 \
        -i SS --summary-only /tmp/galGal2.sample.ss \
      | awk '$1 == "[aggregate]" {printf "%0.4f\n", $3 + $4}' \
      >> gcs.txt
    end
    ave gcs.txt
#average 0.418849
    # OK, so use 0.419 as the --gc parameter below.
    rm /tmp/galGal2.sample.ss

    ############### FIRST ITERATION OF PARAMETER ESTIMATION ONLY #############
    # Use phyloFit to make an initial model:
    ssh kolossus
    cd /cluster/data/galGal2/bed/multiz4way/phastCons
    /cluster/bin/phast/msa_view -i MAF \
      /santest/scratch/galGal2/maf4way/chr{1,5,10,Z_random,32}.maf \
        --aggregate galGal2,hg17,monDom1,danRer2 \
        > /santest/scratch/galGal2/phastCons/all.ss
    /cluster/bin/phast/phyloFit /santest/scratch/galGal2/phastCons/all.ss \
      --tree "((galGal2,(hg17,monDom1)),danRer2)" \
      -i SS --out-root starting-tree
    cat starting-tree.mod 
#TREE: ((galGal2:0.217116,(hg17:0.159841,monDom1:0.154894):0.109434):0.197513,danRer2:0.197513);

    # Use consEntropy --NH to make it suggest a --expected-lengths param 
    # that we should try next.  Adam ran this on hg17 to find out the 
    # total entropy of the hg17 model:
    # consEntropy 0.265 12 ave.cons.mod ave.noncons.mod
#Transition parameters: gamma=0.265000, omega=12.000000, mu=0.083333, nu=0.030045
#Relative entropy: H=0.608216 bits/site
#Required length: N=16.085437 sites
#Total entropy: NH=9.783421 bits
    # Our target is that NH result: 9.7834 bits.  
    # phyloFit makes one .mod but consEntropy wants two...
    # Multiply each subst rate on the TREE line by 3.3 which is roughly the 
    # ratio of noncons to cons in 
    # /cluster/data/galGal2/bed/multiz7way/cons/run.estimate/ave.*.mod
    /cluster/bin/phast/tree_doctor -s 3.3 starting-tree.mod \
      > starting-tree.noncons.mod
    # Use params from last run of 7way:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.22 12.4 \
      starting-tree*.mod
#( Solving for new omega: 12.400000 10.781171 10.586253 10.582989 )
#Transition parameters: gamma=0.220000, omega=12.400000, mu=0.080645, nu=0.022746
#Relative entropy: H=0.765400 bits/site
#Required length: N=13.193791 sites
#Total entropy: NH=10.098533 bits
#Recommended expected length: omega=10.582989 sites (for NH=9.783400)
    # OK, start with 10.6 below.

    ############## SUBSEQUENT ITERATIONS OF PARAM ESTIMATION ONLY ###########
    # We're here because the actual target coverage was not satisfactory,
    # so we're changing the --target-coverage param.  Given that we're 
    # changing that, take a guess at how we should change --expected-lengths
    # in order to also hit the total entropy target.
    cd /cluster/data/galGal2/bed/multiz4way/phastCons/run.estimate
    # SECOND ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.35 12 ave.{cons,noncons}.mod
#Recommended expected length: omega=18.452486 sites (for NH=9.783400)
    # OK, --expected-lengths 18.5

    # Now set up cluster job to estimate model parameters given free params 
    # --target-coverage and --expected-lengths and the data.  
    ssh kk9
    mkdir /cluster/data/galGal2/bed/multiz4way/phastCons/run.estimate
    cd /cluster/data/galGal2/bed/multiz4way/phastCons/run.estimate
    # FIRST ITERATION: Use ../starting-tree.noncons.mod:
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
# doEstimate.sh (first iteration): estimate cons/noncons tree models for one
# SS window, seeded from the phyloFit starting tree.
# Arguments: $1 = window .ss.gz, $2 = log file, $3 = output tree root.
zcat $1 \
| /cluster/bin/phast/phastCons - ../starting-tree.noncons.mod --gc 0.419 --nrates 1,1 \
    --no-post-probs --ignore-missing \
    --expected-lengths 10.6 --target-coverage 0.22 \
    --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    # SUBSEQUENT ITERATIONS: Use last iteration's estimated noncons model.
    cat << '_EOF_' > doEstimate.sh
#!/bin/csh -ef
# doEstimate.sh (subsequent iterations): re-estimate tree models for one SS
# window, seeded from the previous iteration's averaged noncons model and
# using the updated --target-coverage/--expected-lengths parameters.
# Arguments: $1 = window .ss.gz, $2 = log file, $3 = output tree root.
zcat $1 \
| /cluster/bin/phast/phastCons - ave.noncons.mod --gc 0.419 --nrates 1,1 \
    --no-post-probs --ignore-missing \
    --expected-lengths 18.5 --target-coverage 0.35 \
    --quiet --log $2 --estimate-trees $3
'_EOF_'
# << for emacs
    chmod a+x doEstimate.sh
    rm -fr LOG TREES
    mkdir -p LOG TREES
    rm -f jobList
    foreach f (/santest/scratch/galGal2/phastCons/WINDOWS/*.ss.gz)
      set root = $f:t:r:r
      echo doEstimate.sh $f LOG/$root.log TREES/$root >> jobList
    end
    # run cluster job
    para create jobList
    para try, check, push, check, ...
#Completed: 1160 of 1160 jobs
#Average job time:                  16s       0.26m     0.00h    0.00d
#Longest finished job:              45s       0.75m     0.01h    0.00d
#Submission to last job:           285s       4.75m     0.08h    0.00d

    # Now combine parameter estimates.  We can average the .mod files
    # using phyloBoot.  This must be done separately for the conserved
    # and nonconserved models
    ssh kolossus
    cd /cluster/data/galGal2/bed/multiz4way/phastCons/run.estimate
    ls -1 TREES/*.cons.mod > cons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*cons.txt' \
      --output-average ave.cons.mod > cons_summary.txt
    ls -1 TREES/*.noncons.mod > noncons.txt
    /cluster/bin/phast/phyloBoot --read-mods '*noncons.txt' \
      --output-average ave.noncons.mod > noncons_summary.txt
    grep TREE ave*.mod
    # FIRST ITERATION:
#ave.cons.mod:TREE: ((galGal2:0.069779,(hg17:0.067234,monDom1:0.063760):0.048611):0.083881,danRer2:0.083881);
#ave.noncons.mod:TREE: ((galGal2:0.250372,(hg17:0.241752,monDom1:0.227546):0.176240):0.307772,danRer2:0.307772);
    # SECOND ITERATION:
#ave.cons.mod:TREE: ((galGal2:0.084421,(hg17:0.080762,monDom1:0.076024):0.057691):0.102428,danRer2:0.102428);
#ave.noncons.mod:TREE: ((galGal2:0.266716,(hg17:0.255716,monDom1:0.239179):0.183533):0.330915,danRer2:0.330915);

    cat cons_summary.txt 
    # look over the files cons_summary.txt and noncons_summary.txt.
    # The means and medians should be roughly equal and the stdevs
    # should be reasonably small compared to the means, particularly
    # for rate matrix parameters (at bottom) and for branches to the
    # leaves of the tree.  The stdevs may be fairly high for branches
    # near the root of the tree; that's okay.  Some min values may be
    # 0 for some parameters.  That's okay, but watch out for very large
    # values in the max column, which might skew the mean.  If you see
    # any signs of bad outliers, you may have to track down the
    # responsible .mod files and throw them out.  I've never had to do
    # this; the estimates generally seem pretty well behaved.

    # NOTE: Actually, a random sample of several hundred to a thousand
    # alignment fragments (say, a number equal to the number of
    # available cluster nodes) should be more than adequate for
    # parameter estimation.  If pressed for time, use this strategy.

    # Check the total entropy figure to see if we're way off.
    # We probably don't need to do this, since it has always been very close 
    # if not the same as what we used above, but it only takes a second.
    # FIRST ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.22 10.6 ave.{cons,noncons}.mod
#Recommended expected length: omega=7.336732 sites (for NH=9.783400)
    # That is so low!  But OK, try it below.
    # SECOND ITERATION:
    /cluster/bin/phast/consEntropy --NH 9.7834 0.35 18.5 ave.{cons,noncons}.mod
#Recommended expected length: omega=17.857022 sites (for NH=9.783400)
    # OK, tweak --expected-lengths to 17.9 below (and for next iteration).

    # Now we are ready to set up the cluster job for computing the
    # conservation scores and predicted elements.  The we measure the 
    # conserved elements coverage, and if that's not satisfactory then we 
    # adjust parameters and repeat.  
    ssh kk9
    mkdir /cluster/data/galGal2/bed/multiz4way/phastCons/run.phast
    cd /cluster/data/galGal2/bed/multiz4way/phastCons/run.phast
    cat << 'EOF' > doPhastCons.sh
#!/bin/csh -ef
# doPhastCons.sh: score one SS window with the averaged models, writing
# conserved-element predictions (BED) and compressed per-base posterior
# probabilities.
# Argument: $1 = window .ss.gz (name encodes chrom and window offset).
set pref = $1:t:r:r
# Chrom name is the part of the window prefix before the first ".".
set chr = `echo $pref | awk -F\. '{print $1}'`
set tmpfile = /scratch/phastCons.$$
zcat $1 \
| /cluster/bin/phast/phastCons - \
    ../run.estimate/ave.cons.mod,../run.estimate/ave.noncons.mod \
    --expected-lengths 17.9 --target-coverage 0.35 \
    --quiet --seqname $chr --idpref $pref \
    --viterbi /santest/scratch/galGal2/phastCons/ELEMENTS/$pref.bed --score \
    --require-informative 0 \
  > $tmpfile
# Posterior probs go to the shared POSTPROBS dir, compressed.
gzip -c $tmpfile > /santest/scratch/galGal2/phastCons/POSTPROBS/$pref.pp.gz
rm $tmpfile
'EOF'
# << for emacs
    chmod a+x doPhastCons.sh
    rm -fr /santest/scratch/galGal2/phastCons/{POSTPROBS,ELEMENTS}
    mkdir -p /santest/scratch/galGal2/phastCons/{POSTPROBS,ELEMENTS}
    rm -f jobList
    foreach f (/santest/scratch/galGal2/phastCons/WINDOWS/*.ss.gz)
      echo doPhastCons.sh $f >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 1160 of 1160 jobs
#Average job time:                  10s       0.17m     0.00h    0.00d
#Longest finished job:              16s       0.27m     0.00h    0.00d
#Submission to last job:           138s       2.30m     0.04h    0.00d

    # back on kolossus:
    # combine predictions and transform scores to be in 0-1000 interval
    cd /cluster/data/galGal2/bed/multiz4way/phastCons
    awk '{printf "%s\t%d\t%d\tlod=%d\t%s\n", $1, $2, $3, $5, $5;}' \
      /santest/scratch/galGal2/phastCons/ELEMENTS/*.bed \
    | /cluster/bin/scripts/lodToBedScore > all.bed

    ssh hgwdev
    # Now measure coverage of CDS by conserved elements. 
    # We want the "cover" figure to be close to 68.9%.
    # But what to use as CDS?  There's a refGene but it's terribly incomplete.
    # There's a human proteins track, but that's only what human and chicken 
    # have in common.  There's a xenoRefGene too... again comparative-
    # influenced, but a lot more complete than refGene presumably.
    # So just go with first cut results for now?
    cd /cluster/data/galGal2/bed/multiz4way/phastCons
    featureBits -enrichment galGal2 refGene:cds all.bed
    featureBits -enrichment galGal2 xenoRefGene:cds all.bed
    featureBits -enrichment galGal2 blastHg17KG all.bed
    # FIRST ITERATION: too low, increase --target-coverage and try again.
#refGene:cds 0.340%, all.bed 2.488%, both 0.187%, cover 55.04%, enrich 22.12x
#xenoRefGene:cds 1.644%, all.bed 2.488%, both 1.011%, cover 61.48%, enrich 24.71x
#blastHg17KG 2.001%, all.bed 2.488%, both 0.993%, cover 49.63%, enrich 19.95x
    # SECOND ITERATION: this'll do:
#refGene:cds 0.340%, all.bed 3.497%, both 0.231%, cover 67.79%, enrich 19.39x
#xenoRefGene:cds 1.644%, all.bed 3.497%, both 1.260%, cover 76.62%, enrich 21.91x
#blastHg17KG 2.001%, all.bed 3.497%, both 1.263%, cover 63.12%, enrich 18.05x

    # Having met the CDS coverage target, load up the results.
    hgLoadBed galGal2 phastConsElements4way all.bed

    # Create wiggle on the small cluster
    ssh kki
    mkdir /cluster/data/galGal2/bed/multiz4way/phastCons/run.wib
    cd /cluster/data/galGal2/bed/multiz4way/phastCons/run.wib
    rm -rf /santest/scratch/galGal2/phastCons/wib
    mkdir -p /santest/scratch/galGal2/phastCons/wib
    cat << 'EOF' > doWigEncode
#!/bin/csh -ef
# doWigEncode: build one chromosome's phastCons wiggle from the per-window
# posterior-probability files.
# Argument: $1 = chromosome name (e.g. chr1).
set chr = $1
cd /santest/scratch/galGal2/phastCons/wib
# Concatenate this chrom's per-window .pp.gz files in genomic window order
# (numeric sort on the field after the first "."), then encode the stream
# into ${chr}_phastCons.wig (index) and ${chr}_phastCons.wib (binary data).
zcat `ls -1 /santest/scratch/galGal2/phastCons/POSTPROBS/$chr.*.pp.gz \
      | sort -t\. -k2,2n` \
| wigEncode stdin ${chr}_phastCons.wi{g,b}
'EOF'
# << for emacs
    chmod a+x doWigEncode
    rm -f jobList
    foreach chr (`ls -1 /santest/scratch/galGal2/phastCons/POSTPROBS \
                  | awk -F\. '{print $1}' | sort -u`)
      echo doWigEncode $chr >> jobList
    end
    para create jobList
    para try, check, push, check, ...
#Completed: 51 of 51 jobs
#Average job time:                   5s       0.09m     0.00h    0.00d
#Longest finished job:              21s       0.35m     0.01h    0.00d
#Submission to last job:            32s       0.53m     0.01h    0.00d

    # back on kolossus, copy wibs, wigs and POSTPROBS (people sometimes want 
    # the raw scores) from bluearc
    cd /cluster/data/galGal2/bed/multiz4way/phastCons
    rm -rf wib POSTPROBS
    rsync -av /santest/scratch/galGal2/phastCons/wib .
    rsync -av /santest/scratch/galGal2/phastCons/POSTPROBS .

    # load wiggle component of Conservation track
    ssh hgwdev
    mkdir /gbdb/galGal2/multiz4way/wib
    cd /cluster/data/galGal2/bed/multiz4way/phastCons
    chmod 775 . wib
    chmod 664 wib/*.wib
    ln -s `pwd`/wib/*.wib /gbdb/galGal2/multiz4way/wib/
    hgLoadWiggle galGal2 phastCons4way \
      -pathPrefix=/gbdb/galGal2/multiz4way/wib wib/*.wig
    rm wiggle.tab

    # make top-5000 list and launcher on Adam's home page:
    sed -e 's/lod=//' all.bed | sort -k4,4nr | head -5000 \
    | awk '{printf "%s\t%d\t%d\tlod=%d\t%d\n", $1, $2, $3, $4, $4}' \
    > top5000.bed
    # Don't have group write perms to /cse/grads/acs/ so can't do this anymore:
    /cluster/home/acs/bin/make-launcher-with-scores.sh top5000.bed \
      /cse/grads/acs/public_html/gg-top5000-4way \
      "top 5000 conserved elements (4way)" galGal2

    # and clean up bluearc.
    rm -r /santest/scratch/galGal2/phastCons/{ELEMENTS,POSTPROBS,wib}
    rm -r /santest/scratch/galGal2/chrom


# MAKE AN NCBI LIFT FILE FOR CHICKEN (DONE 4/29/05 angie)
    # In order to import dbSNP data, must lift NCBI-named contigs to 
    # chromosomes.  I don't see an agp file in the chicken ftp dir, 
    # so seq_contig.md will have to do:
    cd /cluster/data/galGal2/jkStuff
    wget ftp://ftp.ncbi.nih.gov/genomes/Gallus_gallus/seq_contig.md.gz
    # Wow, *lots* of differences between NCBI and our AGP on chrUn,
    # and some on chr{W,Z}_random as well.  Resolved them by making 
    # small edits to jkStuff/chr{Un,W_random,Z_random}.ncbiAlike.agp 
    # and writing a galGal2-tailored script to attempt to line up seq_contig 
    # and AGP by size/order of sequence.. icch, but that's the best 
    # I can do without an AGP from NCBI!  Also, seq_contig shows 
    # everything as + strand... I hope that means that NCBI reverse-
    # complemented the sequences that appear - in the AGP, but have not 
    # verified.  
    zcat seq_contig.md.gz \
    | ./seq_contig_to_lift.pl \
      > liftNcbi.lft
    awk '{print $4 " " $5;}' liftNcbi.lft | uniq | sort > /tmp/1
    awk '{print $1 " " $2;}' ../chrom.sizes | sort > /tmp/2
    diff /tmp/1 /tmp/2
#43c43
#< chr8_random 5783
#---
#> chr8_random 15783
    # chr8_random diff is NBD: the agp has an extra gap at the end.


# LOAD DBSNP (DONE 4/29/05 angie)
    # Using Daryl's method from makeHg17.doc...
    ssh kolossus
    set db    = galGal2
    set org   = chicken
    set build = 124
    set dir   = /santest/scratch/snp/$db/build$build
    mkdir -p $dir $dir/ds_ch.xml $dir/det $dir/str $dir/loc $dir/seq
    cd $dir

    # started 8:10pm - finished in ~10min, not nearly as much for chicken 
    # as human!
    # Check to make sure the chrMT file is included
    screen
      ftp ftp.ncbi.nih.gov
        cd snp/$org/XML
        prompt
        mget ds_ch*.xml.gz
      exit # screen
    exit # machine

    # Don't know what Daryl means by this: check chromStart for each locType

    cp -f {$HOME}/kent/src/hg/snp/parseDbSnpXML /cluster/bin/scripts
    chmod 775 /cluster/bin/scripts/parseDbSnpXML

    ssh kk9
    set db    = galGal2
    set org   = chicken
    set build = 124
    set dir   = /santest/scratch/snp/$db/build$build
    cd $dir
    touch jobList
    foreach file ( $dir/ds_ch*.xml.gz )
      set out = $file:t:r
      echo /cluster/bin/scripts/parseDbSnpXML $file $dir $out.contig >> jobList
    end

    # I removed these from the job list:
    # ds_chMulti.xml.gz
    # ds_chNotOn.xml.gz
    # ds_chLG.xml.gz
    # ds_chMasked.xml.gz
    para create jobList
    para push; para check ... 
#Completed: 35 of 35 jobs
#Average job time:                 117s       1.95m     0.03h    0.00d
#Longest finished job:             777s      12.95m     0.22h    0.01d
#Submission to last job:           777s      12.95m     0.22h    0.01d
    # exit kk

    # back on kolossus:
    cd $dir
    # some of the NT contigs are not in the liftSpec - this is
    # expected as snps that map to alternate assemblies (Celera) are
    # in the original files, but we disregard their mappings.
    time liftUp $db.build$build.bed /cluster/data/$db/jkStuff/liftNcbi.lft \
       warn det/ds_ch*.xml.contig.det.gz |& grep liftSpec \
    | awk '{print $1;}' | sort -u > notLifted.log
#19.722u 3.497s 0:27.95 83.0%    0+0k 0+0io 0pf+0w
    time gzip $db.build$build.bed
#13.034u 0.318s 0:14.07 94.8%    0+0k 0+0io 0pf+0w
    du -sh .
#558M    .

    mkdir -p /cluster/data/$db/bed/snp
    rsync -av $dir /cluster/data/$db/bed/snp
    set dir = /cluster/data/$db/bed/snp/build$build

    ssh hgwdev
    set db    = galGal2
    set org   = chicken
    set build = 124
    set dir = /cluster/data/$db/bed/snp/build$build
    cd $dir
    # hgLoadBed is the important step - check to make sure there are no warnings
    time nice hgLoadBed $db snp $db.build$build.bed.gz \
      -sqlTable=${HOME}/kent/src/hg/lib/snp.sql
#Loaded 2863139 elements of size 16
#82.270u 14.920s 15:07.24 10.7%  0+0k 0+0io 17803pf+0w
    # If there are warnings, do this to analyze what didn't get loaded right:
    #zcat $db.build$build.bed.gz | sort > /tmp/1
    #echo select \* from snp | hgsql -N galGal2 \
    #| perl -wpe 's/^[0-9]+\t//' | sort > /tmp/2
    #diff /tmp/1 /tmp/2 | less
    # Copy over snpExceptions table from hg17:
    hgsql galGal2 -e \
      'create table snpExceptions select * from hg17.snpExceptions'

    # basic snp table is now loaded, but exception column needs to be updated

    # run queries from snpException.query against snp table
    umask 002
    mkdir -p /usr/local/apache/htdocs/qa/test-results/snpException/$db/build$build
    cd       /usr/local/apache/htdocs/qa/test-results/snpException/$db/build$build
    time nice snpException $db 0 ${db}snpException > ${db}snpException.log
#0.690u 1.300s 13:37.00 0.2%     0+0k 0+0io 206pf+0w

    # check alignment of flanking sequences
    time nice snpValid $db /cluster/data/$db/bed/snp/build$build/seq \
      > ${db}snpValid.log
#1110.730u 40.640s 21:22.09 89.8%        0+0k 0+0io 2942pf+0w



#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)

# CHICKEN 13K ARRAY PROBES (DONE 5/19/2005 Andy)
    # Probe annotations (including genomic coordinates) come from the FHCRC
    # spotted chicken 13k array resource.
    mkdir /cluster/data/galGal2/bed/chicken13k
    cd /cluster/data/galGal2/bed/chicken13k
    wget http://www.fhcrc.org/shared_resources/genomics/dna_array/spotted_arrays/chicken_array/chicken13k_anno_v2.0.txt
    # Pull probe ID (col 1) and chrom/start/end (cols 8-10); keep only rows
    # that actually carry a chr* position, then reorder into BED 4
    # (chrom, start, end, name).
    cut -f1,8,9,10 chicken13k_anno_v2.0.txt | grep chr \
      | awk '{printf("%s\t%s\t%s\t%s\n", $2, $3, $4, $1)}' > chicken13k.bed
    # Not all the probes are mapped.
    # (tail +2 skips the header line; obsolete syntax for tail -n +2)
    tail +2 chicken13k_anno_v2.0.txt | wc -l
# 15769
    wc -l chicken13k.bed
# 13174
    # So that's a loss of about 17% of the probes.
    # Double-checked the half-open bed start in the browser with the corresponding EST.
    # It appears to be bed-ready.
    hgLoadBed galGal2 chicken13k chicken13k.bed
    # trackDb entry (literal text to paste into trackDb.ra):
track chicken13k
shortLabel 13k Probes
longLabel Probes for 13k Microarray
group regulation
priority 91.0
visibility hide
type bed 4 .
    # trackDb findSpec (literal text to paste into trackDb.ra):
searchName chicken13k
searchTable chicken13k
searchType bed
searchMethod exact
searchPriority 55
   # Add information to the chicken13kInfo table:
   # Keep annotation cols 1-7 and 11-15 (the coordinates, cols 8-10, already
   # live in the chicken13k BED table); tail +2 drops the header line.
   cut -f1-7,11-15 chicken13k_anno_v2.0.txt | tail +2 > chicken13kInfo.txt
   hgsql galGal2 < ~/kent/src/hg/lib/chicken13kInfo.sql
   # NOTE: the table name must match the case used in chicken13kInfo.sql.
   # MySQL table names are case-sensitive on Linux, so the previously logged
   # 'chicken13kinfo' would fail with "table ... doesn't exist".
   echo "load data local infile 'chicken13kInfo.txt' into table chicken13kInfo" | hgsql galGal2


############################################################################
##  BLASTZ swap from mm8 alignments (DONE - 2006-02-18 - Hiram)
    # Reuse the existing mm8-vs-galGal2 blastz run: -swap derives the
    # galGal2-vs-mm8 chains/nets from it instead of re-aligning from scratch.
    ssh pk
    cd /cluster/data/mm8/bed/blastzGalGal2.2006-02-18
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
        `pwd`/DEF > swap.out 2>&1 &

    # Sanity check: fraction of the chicken genome covered by mouse chains.
    time nice -n +19 featureBits galGal2 chainMm8Link
    #   57074100 bases of 1054197620 (5.414%) in intersection


# SWAP CHAINS/NET RN4 (DONE 3/30/06 angie)
    # Swap the rn4-vs-galGal2 alignments to produce galGal2-vs-rn4 chains/nets.
    ssh kkr3u00
    mkdir /cluster/data/galGal2/bed/blastz.rn4.swap
    cd /cluster/data/galGal2/bed/blastz.rn4.swap
    doBlastzChainNet.pl -swap /cluster/data/rn4/bed/blastz.galGal2/DEF \
      -workhorse kkr3u00 >& do.log & tail -f do.log
    # Canonical (date-less) name for downstream use; relative target resolves
    # within /cluster/data/galGal2/bed.
    ln -s blastz.rn4.swap /cluster/data/galGal2/bed/blastz.rn4


# BLASTZ/CHAIN/NET XENTRO2 (DONE 4/20/06 angie)
    ssh kkstore03
    mkdir /cluster/data/galGal2/bed/blastz.xenTro2.2006-04-20
    cd /cluster/data/galGal2/bed/blastz.xenTro2.2006-04-20
    # Write the DEF parameter file for doBlastzChainNet.pl; the quoted
    # '_EOF_' delimiter prevents any shell expansion of the heredoc body.
    cat << '_EOF_' > DEF
# chicken vs. frog
BLASTZ=/cluster/bin/penn/x86_64/blastz.v7.x86_64

# Use same params as used for galGal2-xenTro1 (see makeXenTro1.doc)
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=8000
BLASTZ_K=2200
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Chicken galGal2
SEQ1_DIR=/scratch/hg/galGal2/nib
SEQ1_LEN=/cluster/data/galGal2/chrom.sizes
SEQ1_CHUNK=50000000
SEQ1_LAP=10000

# QUERY: Frog xenTro2 - single chunk big enough to run two of the
#               largest scaffolds in one job
SEQ2_DIR=/scratch/hg/xenTro2/xenTro2.2bit
SEQ2_LEN=/san/sanvol1/scratch/xenTro2/chrom.sizes
SEQ2_CHUNK=20000000
SEQ2_LAP=0
SEQ2_LIMIT=100

BASE=/cluster/data/galGal2/bed/blastz.xenTro2.2006-04-20
'_EOF_'
    # << emacs
    doBlastzChainNet.pl -blastzOutRoot=/san/sanvol1/galGal2XenTro2 \
      -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose DEF \
      >& do.log & tail -f do.log
    # Canonical (date-less) name for downstream use; relative target resolves
    # within /cluster/data/galGal2/bed.
    ln -s blastz.xenTro2.2006-04-20 /cluster/data/galGal2/bed/blastz.xenTro2


#####################################################################
# SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie)
    # File emailed from Xinwei She <xws@u.washington.edu>
    mkdir /cluster/data/galGal2/bed/genomicSuperDups
    cd /cluster/data/galGal2/bed/genomicSuperDups
    # Normalize the "_" placeholder (presumably the strand column -- verify
    # against the genomicSuperDups schema) to "-", keep only duplications at
    # least 1kb long on both sides, and load into the table using the
    # genomicSuperDups-specific SQL definition.
    sed -e 's/\t_\t/\t-\t/' galGal2_genomicSuperDup.tab \
    | awk '($3 - $2) >= 1000 && ($9 - $8) >= 1000 {print;}' \
    | hgLoadBed galGal2 genomicSuperDups stdin \
      -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql


