# for emacs: -*- mode: sh; -*-


# This file describes how we made the browser database on the mouse
# genome, February 2003 build.

DOWNLOAD THE MOUSE SEQUENCE FROM NCBI (DONE 02/06/03)
    mkdir -p /cluster/store2/mm.2003.02/ncbi
    cd /cluster/store2/mm.2003.02/ncbi
    mkdir chrfasta contigfasta
    ftp ftp.ncbi.nih.gov
      # user hgpguest, password from /cse/guests/kent/buildHg6.doc
      cd mouse_30
      prompt
      bin
      mget *
      quit
    gunzip *.agp.gz

GET UPDATED MOUSE ASSEMBLY FILES FROM NCBI (DONE 02/19/03)
    # Deanna regenerated allrefcontig.chr.agp for us with _random's.  
    # While downloading that, I noticed some other new files.  I saved 
    # the original copies of those to *.orig.  
    cd /cluster/store2/mm.2003.02/ncbi
    ftp ftp.ncbi.nih.gov
      # user hgpguest, password from /cse/guests/kent/buildHg6.doc
      cd mouse_30
      prompt
      bin
      mget allrefcontig.chr.agp.gz contig.idmap ctg_coords seq_contig.md
      quit    

BREAK UP SEQUENCE INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS (DONE 02/06/03)

    # This version of the mouse sequence data is in 
    # /cluster/store2/mm.2003.02/mm3/assembly
    # This will split the mouse sequence into approx. 5 Mbase supercontigs 
    # between non-bridged clone contigs and drop the resulting dir structure 
    # in /cluster/store2/mm.2003.02/mm3.
    # The resulting dir structure will include 1 dir for each chromosome, 
    # each of which has a set of subdirectories, one subdir per supercontig. 
    ssh hgwdev
    # cd into your CVS source tree under kent/src/hg/splitFaIntoContigs
    make
    ssh kkstore
    mkdir /cluster/store2/mm.2003.02/mm3
    cd /cluster/store2/mm.2003.02/mm3
    # splitFaIntoContigs doesn't do right with agp lines arriving in a 
    # different order than fasta chrom sequences.  so split up the agp 
    # into one per chrom.
    foreach c ( 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Y Un )
      mkdir $c
      perl -we "while(<>){if (/^chr$c\t/) {print;}}" \
        ../ncbi/allrefcontig.chr.agp \
        > $c/chr$c.agp
      gunzip -c /cluster/store2/mm.2003.02/ncbi/chrfasta/chr$c.fa.gz \
        | perl -wpe 's/^>lcl\|(chr\w+)\.fa.*/>$1/' \
        | splitFaIntoContigs $c/chr$c.agp \
          stdin /cluster/store2/mm.2003.02/mm3 -nSize=5000000
    end

CREATE CHROM-LEVEL AGP AND FASTA FOR _RANDOMS (DONE 02/20/03)
    ssh kkstore
    cd /cluster/store2/mm.2003.02/ncbi
    ../mm3/jkStuff/ncbiToRandomAgps seq_contig.md allrefcontig.chr.agp \
      contig.idmap ../mm3
    cd /cluster/store2/mm.2003.02/mm3
    foreach c (?{,?})
      if (-e $c/chr${c}_random.ctg.agp) then
        echo building $c/chr${c}_random.fa
        gunzip -c ../ncbi/contigfasta/chr$c.fa.gz \
          | perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' \
          > ./tmp.fa
        agpToFa -simpleMulti $c/chr${c}_random.ctg.agp chr${c}_random \
          $c/chr${c}_random.fa ./tmp.fa
        rm tmp.fa
      endif
    end
    # At 37402 sequences with 50000-long gaps in between, chrUn is just 
    # too damn big to fit in memory on any of our machines!!  So use a 
    # gap size of 1000 (OK with Jim and Terry) just for chrUn.  
    cd /cluster/store2/mm.2003.02/ncbi
    ../mm3/jkStuff/ncbiToRandomAgps -gapLen 1000 -chrom Un \
      seq_contig.md allrefcontig.chr.agp contig.idmap ../mm3
    cd /cluster/store2/mm.2003.02/mm3
    set c=Un
    echo building $c/chr${c}_random.fa
    gunzip -c ../ncbi/contigfasta/chr$c.fa.gz \
      | perl -wpe 's/^>lcl\|(Mm\w+)\s+.*$/>$1/' \
      > ./tmp.fa
    agpToFa -simpleMulti $c/chr${c}_random.ctg.agp chr${c}_random \
      $c/chr${c}_random.fa ./tmp.fa
    rm tmp.fa
    # Clean these up to avoid confusion later... they're easily rebuilt.
    rm ?{,?}/*.ctg.agp

BREAK UP _RANDOMS INTO 5 MB CHUNKS AT NON-BRIDGED CONTIGS (DONE 02/20/03)
    ssh kkstore
    cd /cluster/store2/mm.2003.02/mm3
    foreach c (?{,?})
      if (-e $c/chr${c}_random.agp) then
        splitFaIntoContigs $c/chr${c}_random.agp $c/chr${c}_random.fa . \
          -nSize=5000000
        mv ${c}_random/lift/oOut.lst $c/lift/rOut.lst
        mv ${c}_random/lift/ordered.lft $c/lift/random.lft
        mv ${c}_random/lift/ordered.lst $c/lift/random.lst
        rmdir ${c}_random/lift
        rm ${c}_random/chr${c}_random.{agp,fa}
        mv ${c}_random/* $c
        rmdir ${c}_random
      endif
    end

CREATING DATABASE (DONE 02/06/03)

o - Create the database.
     - ssh hgwdev
     - Enter mysql via:
           mysql -u hgcat -pbigsecret
     - At mysql prompt type:
        create database mm3;
        quit
     - make a semi-permanent read-only alias:
        alias mm3 "mysql -u hguser -phguserstuff -A mm3"
o - Use df to ake sure there is at least 5 gig free on hgwdev:/var/lib/mysql


CREATING GRP TABLE FOR TRACK GROUPING (DONE 02/11/03)
    ssh hgwdev
    echo "create table grp (PRIMARY KEY(NAME)) select * from mm2.grp" \
      | hgsql mm3


STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  (DONE 02/20/03)

    # Create (unmasked) nib files 
    ssh kkstore
    cd ~/mm
    mkdir -p nib
    foreach f (?{,?}/chr*.fa)
      echo $f:t:r
      faToNib $f nib/$f:t:r.nib
    end
    # Create symbolic links from /gbdb/mm3/nib to real nib files
    ssh hgwdev
    mkdir -p /gbdb/mm3/nib
    foreach f (/cluster/store2/mm.2003.02/mm3/nib/chr*.nib)
      ln -s $f /gbdb/mm3/nib
    end

    # Load /gbdb nib paths into database and save size info.
    ssh hgwdev
    hgsql mm3  < ~/src/hg/lib/chromInfo.sql
    cd ~/mm
    hgNibSeq -preMadeNib mm3 /gbdb/mm3/nib ?/chr*.fa ??/chr*.fa 
    echo "select chrom,size from chromInfo" | hgsql -N mm3 > chrom.sizes

Store o+o info in database.
     cd /cluster/store2/mm.2003.02/mm3
     hgGoldGapGl mm3 /cluster/store2/mm.2003.02 mm3 -noGl

Make and load GC percent table
     ssh hgwdev
     mkdir -p /cluster/store2/mm.2003.02/mm3/bed/gcPercent
     cd /cluster/store2/mm.2003.02/mm3/bed/gcPercent
     hgsql mm3  < ~/src/hg/lib/gcPercent.sql
     hgGcPercent mm3 ../../nib


ADD MAP CONTIGS TRACK (DONE 02/28/03)
    ssh hgwdev
    mkdir -p ~/mm3/bed/ctgPos
    cd ~/mm3/bed/ctgPos
    # hgCtgPos uses the lift files... but mouse lift files are for the 
    # 5MB contigs from splitFaIntoContigs, not for the real NT_ contigs 
    # from the assembly.  (In the future, we should go with the NT's!)  
    # So... just for this release, go straight from the seq_contig.md 
    # to the table def'n: contig, size, chrom, chromStart, chromEnd 
    perl -we \
     'while (<>) { \
        if (/^\d+\s+(\w+)\s+(\d+)\s+(\d+)\s+\S+\s+(NT_\d+)\s+.*ref_strain/) { \
          $chr=$1; $start=$2; $start -= 1; $end=$3; $ctg=$4; \
          print "$ctg\t" . ($end-$start) . "\tchr$chr\t$start\t$end\n"; \
        } \
      }' /cluster/store2/mm.2003.02/ncbi/seq_contig.md \
    > ctgPos.tab
    hgsql mm3 < ~/kent/src/hg/lib/ctgPos.sql
    echo "load data local infile 'ctgPos.tab' into table ctgPos" | hgsql mm3
    # Note: the info is there in seq_contig.md to also do the _random's, 
    # but we'd have to do some more work: duplicate the gaps of 50000 between 
    # contigs for all _random's except chrUn_random (1000 between).  

MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR MM3 (DONE 02/06/03)
    # Enter mm3 into hgcentraltest.dbDb so test browser knows about it:
    echo 'insert into dbDb values("mm3", "Mouse Feb. 2003", "/gbdb/mm3/nib", "Mouse", "USP18", 1, 30, "Mouse");' \
      | hgsql -h genome-testdb hgcentraltest
    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/src/hg/makeDb/trackDb
    cvs up -d -P
    # Edit that makefile to add mm3 in all the right places and do
    make update
    make alpha
    cvs commit makefile


MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR MM3 (DONE 02/13/03)
    ssh hgwdev
    echo 'insert into blatServers values("mm3", "blat9", "17778", "1"); \
          insert into blatServers values("mm3", "blat9", "17779", "0");' \
      | hgsql -h genome-testdb hgcentraltest


REPEAT MASKING (DONE 03/05/03)
   Split contigs, run RepeatMasker, lift results
   Notes: 
   * If there is a new version of RepeatMasker, build it and ask the admins 
     to binrsync it (kkstore:/scratch/hg/RepeatMasker/*).
   * Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make 
     RepeatMasker runs manageable on the cluster ==> results need lifting.
   * For the NCBI assembly we repeat mask on the sensitive mode setting
     (RepeatMasker -m -s)

        #- Split contigs into 500kb chunks:
        cd ~/mm3
        foreach d ( */chr*_?{,?} )
          cd $d
          set contig = $d:t
          faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \
            -maxN=500000
          cd ../..
        end

        #- Make the run directory and job list:
        cd ~/mm3
        mkdir RMRun
        rm -f RMRun/RMJobs
        touch RMRun/RMJobs
        foreach d ( ?{,?}/chr*_?{,?} )
          foreach f ( $d/chr*_*_*.fa )
            set f = $f:t
            echo /cluster/bin/scripts/RMMouse \
                 /cluster/store2/mm.2003.02/mm3/$d $f \
               '{'check out line+ /cluster/store2/mm.2003.02/mm3/$d/$f.out'}' \
              >> RMRun/RMJobs
          end
        end

        #- Do the run
        ssh kk
        cd ~/mm3/RMRun
        para create RMJobs
        para try, para check, para check, para push, para check,...

        #- Lift up the split-contig .out's to contig-level .out's
	ssh kkstore
        cd ~/mm3
        foreach d ( ?{,?}/chr*_?{,?} )
          cd $d
          set contig = $d:t
          liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null
          cd ../..
        end

        #- Lift up the contig-level .out's to chr-level
        cd ~/mm3
        ./jkStuff/liftOut5.sh

        #- Load the .out files into the database with:
        ssh hgwdev
        cd ~/mm3
        hgLoadOut mm3 ?/*.fa.out ??/*.fa.out

VERIFY REPEATMASKER RESULTS (IN PROGRESS)

    # Run featureBits on mm3 and on a comparable genome build, and compare:
    ssh hgwdev
    featureBits mm3 rmsk
    # --> 1079906607 bases of 2708220133 (39.875%) in intersection
    # --> (orig run, July libs:) 1001999794 bases of 2577261074 (38.878%) in intersection
    featureBits mm2 rmsk
    # --> 1037964664 bases of 2726995854 (38.063%) in intersection


MAKE LIFTALL.LFT (DONE 02/20/03)

    cd ~/mm3
    cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft

SIMPLE REPEAT TRACK (DONE 02/07/03; _randoms DONE 03/05/03)
    # TRF runs pretty quickly now... it takes a few hours total runtime.
    # Also, it ignores masking of input sequence, so this can be run in 
    # parallel with RepeatMasker!
    # So instead of binrsyncing and para-running, just do this on kkstore:
    ssh kkstore
    mkdir ~/mm3/bed/simpleRepeat
    cd ~/mm3/bed/simpleRepeat
    mkdir trf
    rm -f jobs.csh
    touch jobs.csh
    foreach f (/cluster/store2/mm.2003.02/mm3/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa)
      set fout = $f:t:r.bed
      echo "/cluster/home/kent/bin/i386/trfBig -trf=/cluster/home/kent/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
        >> jobs.csh
    end
    tcsh jobs.csh |& tee jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    # When job is done do:
    liftUp simpleRepeat.bed ~/mm3/jkStuff/liftAll.lft warn trf/*.bed

    # Load this into the database as so
    ssh hgwdev
    cd ~/mm3/bed/simpleRepeat
    hgLoadBed mm3 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql


PROCESS SIMPLE REPEATS INTO MASK (DONE 02/07/03; _randoms DONE 03/05/03)

    # After the simpleRepeats track has been built, make a filtered version 
    # of the trf output: keep trf's with period <= 12:
    ssh kkstore
    cd ~/mm3/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/chr*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd ~/mm3
    mkdir -p bed/simpleRepeat/trfMaskChrom
    foreach c (?{,?})
      perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
        $c/lift/ordered.lst > $c/lift/oTrf.lst
      liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
        jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      if (-e $c/lift/random.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
           $c/lift/random.lst > $c/lift/rTrf.lst
        liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
      endif
    end


MASK SEQUENCE WITH BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 03/05/03)
    ssh kkstore
    cd ~/mm3
    #- Soft-mask (lower-case) the contig and chr .fa's
    tcsh jkStuff/makeFaMasked.sh
    #- Make hard-masked .fa.masked files as well:
    tcsh jkStuff/makeHardMasked.sh
    #- Rebuild the nib, mixedNib, maskedNib files:
    tcsh jkStuff/makeNib.sh
    # Copy the masked contig fa to /scratch:
    rm -f /scratch/hg/mm3/trfFa
    mkdir -p /scratch/hg/mm3/trfFa
    cp -p ~/mm3/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa /scratch/hg/mm3/trfFa


MAKE DOWNLOADABLE SEQUENCE FILES (DONE 03/05/03)
    ssh kkstore
    cd ~/mm3
    #- Build the .zip files
    jkStuff/zipAll.sh |& tee zipAll.log
    mkdir zip
    mv *.zip zip
    cd zip
    #- Look at zipAll.log to make sure all file lists look reasonable.  
    #- Check zip file integrity:
    foreach f (*.zip)
      unzip -t $f > $f.test
      tail -1 $f.test
    end
    wc -l *.zip.test

    #- Copy the .zip files to hgwdev:/usr/local/apache/...
    ssh hgwdev
    cd ~/mm3/zip
    ../jkStuff/cpToWeb.sh
    cd /usr/local/apache/htdocs/goldenPath/mmFeb2003
    #- Take a look at bigZips/* and chromosomes/*, update their README.txt's

    # Then make the upstream sequence files.
    cd bigZips
    featureBits mm3 refGene:upstream:1000 -fa=upstream1000.fa
    zip upstream1000.zip upstream1000.fa
    rm upstream1000.fa
    featureBits mm3 refGene:upstream:2000 -fa=upstream2000.fa
    zip upstream2000.zip upstream2000.fa
    rm upstream2000.fa
    featureBits mm3 refGene:upstream:5000 -fa=upstream5000.fa
    zip upstream5000.zip upstream5000.fa
    rm upstream5000.fa


CREATE CYTOBAND TRACK
    # Can be done at any time
    ssh hgwdev
    cd /cluster/data/mm3
    mkdir cytoBand
    cd cytoBand
    # Get file from NCBI
    wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/maps/mapview/BUILD.30/Mouse400.dat.gz
    gunzip Mouse400.dat.gz
    # Create bed file
    /cluster/bin/scripts/createNcbiCytoBand Mouse400.dat
    # Load the bed file
    hgLoadBed -noBin -sqlTable=/cluster/home/kent/src/hg/lib/cytoBand.sql mm3 cytoBand cytoBand.bed

PREPARE CLUSTER FOR BLASTZ RUN (DONE 03/05/03)
    # This needs to be done after trf-masking and nib generation.
    ssh kkstore
    # Extract lineage-specific repeats using Arian Smit's script:
    mkdir -p ~/mm3/bed/linSpecRep
    cd ~/mm3/bed/linSpecRep
    foreach f (~/mm3/*/*.out)
        ln -sf $f .
    end
    /cluster/bin/scripts/rodentSpecificRepeats.pl *.out
    /cluster/bin/scripts/perl-rename 's/(\.fa|\.nib)//' *.out.*spec
    /cluster/bin/scripts/perl-rename 's/\.(rod|prim)spec/.spec/' *.out.*spec
    rm *.out
    cd ..
    rm -rf /scratch/hg/mm3/linSpecRep
    mkdir -p /scratch/hg/mm3
    cp -Rp linSpecRep /scratch/hg/mm3
    # RepeatMasker .out:
    cd ~/mm3
    rm -rf /scratch/hg/mm3/rmsk
    mkdir -p /scratch/hg/mm3/rmsk
    cp -p ?{,?}/chr?{,?}{,_random}.fa.out /scratch/hg/mm3/rmsk
    # Chrom-level mixed nibs that have been repeat- and trf-masked:
    rm -rf /scratch/hg/mm3/chromTrfMixedNib
    mkdir -p /scratch/hg/mm3/chromTrfMixedNib
    cp -p mixedNib/chr*.nib /scratch/hg/mm3/chromTrfMixedNib
    # Ask cluster-admin@cse.ucsc.edu to binrsync /scratch/hg to clusters

    # Jim's comments Feb 12 '03 about the order in which to run blastz:
    # In general we should do
    # 1) hg/mm
    # 2) mm/rn
    # 3) rn/hg
    # 4) hg/hg
    # 5) mm/mm
    # 6) rn/rn
    # There is now an 'axtSwap' program that might let us
    # get out of having to run the inverse of 1,2 & 3,  though
    # 2 in particular is so fast perhaps it's just as well to
    # do the inverse explicitly.


MAKING AND STORING mRNA AND EST ALIGNMENTS (DONE 02/09/03)

    # Load up the local disks of the cluster with refSeq.fa, mrna.fa and est.fa
    # from /cluster/store2/mrna.133  into /scratch/hg/mrna.133
    # Make sure that /scratch/hg/mm3/trfFa is loaded with chr*_*.fa and pushed 
    # to the cluster nodes.
    ssh kk
    cd ~/mm/bed
    foreach i (refSeq mrna est)
      mkdir $i
      cd $i
      ls -1S /mnt/scratch/hg/mm3/trfFa/*.fa > genome.lst
      ls -1 /mnt/scratch/hg/mrna.133/Mus_musculus/$i.fa > mrna.lst
      cp -p ~/lastMm/bed/$i/gsub .
      mkdir psl
      gensub2 genome.lst mrna.lst gsub spec
      para create spec
      cd ..
    end 

    # In each dir: para try, para check, para push, para check....
      
    # Process refSeq mRNA and EST alignments into near best in genome.
    ssh kkstore
    cd ~/mm/bed

      cd refSeq
      pslSort dirs raw.psl /cluster/store2/tmp psl
      pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 raw.psl contig.psl /dev/null
      liftUp -nohead all_refSeq.psl ../../jkStuff/liftAll.lft warn contig.psl
      pslSortAcc nohead chrom /cluster/store2/tmp all_refSeq.psl
      pslCat -dir chrom > refSeqAli.psl
      cd ..

      cd mrna
      pslSort dirs raw.psl /cluster/store2/tmp psl
      pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl /dev/null
      liftUp -nohead all_mrna.psl ../../jkStuff/liftAll.lft warn contig.psl
      pslSortAcc nohead chrom /cluster/store2/tmp all_mrna.psl
      cd ..

      cd est
      pslSort dirs raw.psl /cluster/store2/tmp psl
      pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl /dev/null
      liftUp -nohead all_est.psl ../../jkStuff/liftAll.lft warn contig.psl
      pslSortAcc nohead chrom /cluster/store2/tmp all_est.psl
      cd ..

    # Load refSeq alignments into database
      ssh hgwdev
      cd ~/mm/bed/refSeq
      hgLoadPsl mm3 -tNameIx refSeqAli.psl

    # Load mRNA alignments into database.
      ssh hgwdev
      cd ~/mm/bed/mrna/chrom
      foreach i (*.psl)
          mv $i $i:r_mrna.psl
      end
      hgLoadPsl mm3 *.psl
      cd ..
      hgLoadPsl mm3 all_mrna.psl -nobin

    # Load EST alignments into database.
      ssh hgwdev
      cd ~/mm/bed/est/chrom
      foreach i (*.psl)
            mv $i $i:r_est.psl
      end
      hgLoadPsl mm3 *.psl
      cd ..
      hgLoadPsl mm3 all_est.psl -nobin

    # Create subset of ESTs with introns and load into database.
      - ssh kkstore
      cd ~/mm
      tcsh jkStuff/makeIntronEst.sh
      - ssh hgwdev
      cd ~/mm/bed/est/intronEst
      hgLoadPsl mm3 *.psl


ADD REFSEQ/MRNA/EST ALIGNMENTS TO _RANDOMS (DONE 03/06/03)
    # Note: in future builds, we should get the _randoms in the beginning,
    # so this should not be necessary!  
    ssh kk
    cd ~/mm/bed
    # refSeq, mrna done -- est too large for pslSort when run on unmasked, 
    # so rerun est on masked.
    foreach i (refSeq mrna est)
      mkdir ${i}_random
      cd ${i}_random
      ls -1S /mnt/scratch/hg/mm3/contigs/chr*_random_*.fa > genome.lst
      ls -1 /mnt/scratch/hg/mrna.133/Mus_musculus/$i.fa > mrna.lst
      cp -p ~/lastMm/bed/$i/gsub .
      #-- Edit the gsub so it doesn't say -mask=lower!!! 
      # _randoms are all lower right now -- I'm running this before masking.
      mkdir psl
      gensub2 genome.lst mrna.lst gsub spec
      para create spec
      cd ..
    end 
    # para try,check,push,check... in each ${i}_random directory.

    # Now run the "Process..." and "Load..." steps above, but for *_random


CREATE REFSEQ GENES TRACK (DONE 02/09/03)
    # Load the refSeq mRNA
    ssh hgwdev
    mkdir -p /gbdb/mm3/mrna.133
    ln -s /cluster/store2/mrna.133/refSeq/org/Mus_musculus/refSeq.fa \
      /gbdb/mm3/mrna.133
    hgLoadRna new mm3
    hgLoadRna add -type=refSeq mm3 /gbdb/mm3/mrna.133/refSeq.fa \
      /cluster/store2/mrna.133/refSeq/org/Mus_musculus/refSeq.ra

    # Produce refGene, refPep, refMrna, and refLink tables as so:
    # Get the proteins:
    ssh kkstore
    cd ~/mm/bed/refSeq
    wget ftp://ftp.ncbi.nih.gov/refseq/M_musculus/mRNA_Prot/mouse.faa.gz
    wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/loc2ref
    wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/mim2loc
    gunzip mouse.faa.gz
    ssh hgwdev
    cd ~/mm/bed/refSeq
    hgRefSeqMrna mm3 \
      /gbdb/mm3/mrna.133/refSeq.fa \
      /cluster/store2/mrna.133/refSeq/org/Mus_musculus/refSeq.ra \
      all_refSeq.psl loc2ref mouse.faa mim2loc
    # Don't worry about the "No gene name" errors

    # Add RefSeq status info
    hgRefSeqStatus -mouse mm3 loc2ref


REFFLAT (DONE 02/25/03)
    # create precomputed join of refFlat and refGene:
    echo 'CREATE TABLE refFlat (KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) SELECT refLink.name as geneName, refGene.* FROM refLink,refGene WHERE refLink.mrnaAcc = refGene.name' | hgsql mm3


LOAD MRNA DATA (DONE 02/09/03)
    ssh hgwdev
    ln -s /cluster/store2/mrna.133/org/Mus_musculus/mrna.fa /gbdb/mm3/mrna.133
    ln -s /cluster/store2/mrna.133/org/Mus_musculus/est.fa /gbdb/mm3/mrna.133
    hgLoadRna add -type=mRNA mm3 /gbdb/mm3/mrna.133/mrna.fa \
      /cluster/store2/mrna.133/org/Mus_musculus/mrna.ra
    hgLoadRna add -type=EST mm3 /gbdb/mm3/mrna.133/est.fa \
      /cluster/store2/mrna.133/org/Mus_musculus/est.ra


LOADING MOUSE MM3 HUMAN BLASTZ ALIGNMENTS FROM PENN STATE: (DONE 03/07/03)

    # Translate Penn State .lav files into sorted axt:
    ssh kkstore
    set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH"
    set seq1_dir="/cluster/store2/mm.2003.02/mm3/mixedNib/"
    set seq2_dir="/cluster/store4/gs.14/build31/mixedNib/"
    set tbl="blastzHuman"
    cd $base
    mkdir -p axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin $seq1_dir $seq2_dir stdout \
        | axtSort stdin $out
      popd
    end

    # Translate the sorted axt files into psl:
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # Load tables
    ssh hgwdev
    set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH"
    set tbl="blastzHuman"
    cd $base/pslChrom
    hgLoadPsl mm3 chr*_${tbl}.psl

MAKING THE BLASTZBESTHUMAN TRACK FROM PENN STATE MM3 AXT FILES (DONE 03/07/03)

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh kkstore
    set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH"
    set seq1_dir="/cluster/store2/mm.2003.02/mm3/mixedNib/"
    set seq2_dir="/cluster/store4/gs.14/build31/mixedNib/"
    set tbl="blastzBestHuman"
    cd $base
    mkdir -p axtBest pslBest
    foreach chrdir (lav/chr*)
      set chr=$chrdir:t
      echo axtBesting $chr
      axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
      echo translating axtBest to psl for $chr
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end

    # Load tables
     ssh hgwdev
     set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH"
     set tbl="blastzBestHuman"
     cd $base/pslBest
     hgLoadPsl mm3 chr*_${tbl}.psl

    # Make /gbdb links and add them to the axtInfo table:
     mkdir -p /gbdb/mm3/axtBestHg13
     cd /gbdb/mm3/axtBestHg13
     foreach f ($base/axtBest/chr*.axt)
       ln -s $f .
     end
     cd $base/axtBest
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/mm3/axtBestHg13/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo VALUES ('hg13','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     hgsql mm3 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql mm3 < axtInfoInserts.sql

MAKING THE HUMAN AXTTIGHT FROM AXTBEST (DONE 03/07/03)
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh kkstore
    cd ~/mm3/bed/blastz.hg13.2003-03-06-ASH/axtBest
    mkdir -p ../axtTight
    foreach i (*.axt)
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end
    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightHuman.psl
    end
    # Load tables into database
    ssh hgwdev
    cd ~/mm3/bed/blastz.hg13.2003-03-06-ASH/pslTight
    hgLoadPsl mm3 chr*_blastzTightHuman.psl


RELOADING HUMAN BEST AND ALL TRACKS WITH HG15 (WERE HG13) (DONE 2003-09-22 kate)
    # NOTE: these are from swapped alignments from human browser   
    # generated by kent & matt.  Tight has already been done.
    ssh hgwdev
    cd /cluster/data/mm3/bed/blastz.hg15.swap/axtBest
cat << 'EOF' > makePsl.csh
    mkdir -p ../pslBest
    foreach f (chr*.axt.gz)
        set c = $f:r:r
        echo $c
        gunzip -c $f | axtToPsl stdin \
                        ../S1.len ../S2.len ../pslBest/${c}_blastzBestHuman.psl
    end
'EOF'
    csh makePsl.csh >&! makePsl.log &
    tail -100f makePsl.log
    cd ../pslBest
    hgLoadPsl mm3 chr*_blastzBestHuman.psl

    cd ../axtChrom
cat << 'EOF' > makePsl.csh
    mkdir -p ../pslChrom
    foreach f (chr*.axt.gz)
        set c = $f:r:r
        echo $c
        gunzip -c $f | axtToPsl stdin \
                        ../S1.len ../S2.len ../pslChrom/${c}_blastzHuman.psl
    end
'EOF'
    csh makePsl.csh >&! makePsl.log &
    tail -100f makePsl.log
    cd ../pslChrom
    hgLoadPsl mm3 chr*_blastzHuman.psl
    

CREATING HUMAN/MOUSE CHAINS AND NET BASED ON BLASTZ ALIGNMENTS MM3 vs HG12
    # Do small cluster run on kkr1u00
    # Make sure that ~/oo points to the latest human and ~/mm to the 
    # latest mouse.
    # The little cluster run will probably take about 1 hours.
    ssh kkr1u00
    cd /cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH
    mkdir axtChain
    cd axtChain
    mkdir run1
    cd run1
    ls -1S ../../axtChrom/*.axt.gz > input.lst
    echo '#LOOP' > gsub
    echo 'doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}' >> gsub
    echo '#ENDLOOP' >> gsub
    gensub2 input.lst single gsub spec
    para create spec
    echo '#!/bin/csh' > doChain
    echo 'gunzip -c $1 | axtChain stdin  ~/mm/mixedNib ~/oo/mixedNib $2 > $3' >> doChain
    chmod a+x doChain
    para shove

    # Do some sorting on the little cluster job on the file server
    # This will take about 20 minutes.  This also ends up assigning
    # a unique id to each member of the chain.
    ssh kkstore
    cd /cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    rm run1/chain/*.chain

    # Load the chains into the database as so.  This will take about
    # 45 minutes.
    ssh hgwdev
    cd /cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH/axtChain
    cd chain
    foreach i (*.chain)
	set c = $i:r
	hgLoadChain mm3 ${c}_humanChain $i
	echo done $c
    end

    # Create the nets.  You can do this while the database is loading
    # This is fastest done on the file server.  All told it takes about
    # 40 minutes.
    ssh kkstore
    cd /cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH/axtChain
    # First do a crude filter that eliminates many chains so the
    # real chainer has less work to do.
    mkdir preNet
    cd chain
    foreach i (*.chain)
      echo preNetting $i
      chainPreNet $i ~/mm/chrom.sizes ~/oo/chrom.sizes ../preNet/$i
    end
    cd ..
    # Run the main netter, putting the results in n1.
    mkdir n1 
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      chainNet $i -minSpace=1 ~/mm/chrom.sizes ~/oo/chrom.sizes ../n1/$n /dev/null
    end
    cd ..
    # Classify parts of net as syntenic, nonsyntenic etc.
    cat n1/*.net | netSyntenic stdin hNoClass.net

    # The final step of net creation needs the database.
    # Best to wait for the database load to finish if it
    # hasn't already.
    ssh hgwdev
    cd /cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH/axtChain
    netClass hNoClass.net mm3 hg13 mouse.net -tNewR=$HOME/mm/bed/linSpecRep -qNewR=$HOME/oo/bed/linSpecRep
    rm -r n1 hNoClass.net

    # Load the net into the database as so:
    netFilter -minGap=10 mouse.net |  hgLoadNet mm3 humanNet stdin

    # Move back to the file server to create axt files corresponding
    # to the net.
    ssh kkstore
    cd /cluster/store2/mm.2003.02/mm3/bed/blastz.hg13.2003-03-06-ASH/axtChain
    mkdir ../axtNet
    netSplit mouse.net mouseNet
    cd mouseNet
    foreach i (*.net)
        set c = $i:r
	netToAxt $i ../chain/$c.chain ~/mm/mixedNib ~/oo/mixedNib ../../axtNet/$c.axt
	echo done ../axt/$c.axt
    end
    cd ..
    rm -r mouseNet

    # At this point there is a blastz..../axtNet directory that should
    # be used in place of the axtBest for making the human/mouse
    # conservation track and for loading up the downloads page.


LOADING MOUSE MM3 RAT BLASTZ ALIGNMENTS FROM PENN STATE: (DONE 03/07/03)

    # Translate Penn State .lav files into sorted axt:
    ssh kkstore
    set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.rn2.2003-03-07-ASH"
    set seq1_dir="/cluster/store2/mm.2003.02/mm3/mixedNib/"
    set seq2_dir="/cluster/store4/rn2/mixedNib/"
    set tbl="blastzRat"
    cd $base
    mkdir -p axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin $seq1_dir $seq2_dir stdout \
        | axtSort stdin $out
      popd
    end

    # Translate the sorted axt files into psl:
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r:r
       axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # Load tables
    ssh hgwdev
    set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.rn2.2003-03-07-ASH"
    set tbl="blastzRat"
    cd $base/pslChrom
    hgLoadPsl mm3 chr*_${tbl}.psl

MAKING THE BLASTZBESTRAT TRACK FROM PENN STATE MM3 AXT FILES (DONE 03/07/03)

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh kkstore
    set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.rn2.2003-03-07-ASH"
    set seq1_dir="/cluster/store2/mm.2003.02/mm3/mixedNib/"
    set seq2_dir="/cluster/store4/rn2/mixedNib/"
    set tbl="blastzBestRat"
    cd $base
    mkdir -p axtBest pslBest
    foreach chrdir (lav/chr*)
      set chr=$chrdir:t
      echo axtBesting $chr
      axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
      echo translating axtBest to psl for $chr
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end

    # Load tables
     ssh hgwdev
     set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.rn2.2003-03-07-ASH"
     set tbl="blastzBestRat"
     cd $base/pslBest
     hgLoadPsl mm3 chr*_${tbl}.psl

    # Make /gbdb links and add them to the axtInfo table:
     mkdir -p /gbdb/mm3/axtBestRn2
     cd /gbdb/mm3/axtBestRn2
     rm -f *
     foreach f ($base/axtBest/chr*.axt)
       ln -s $f .
     end
     cd $base/axtBest
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/mm3/axtBestRn2/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo VALUES ('rn2','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     hgsql mm3 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql mm3 < axtInfoInserts.sql

MAKING THE RAT AXTTIGHT FROM AXTBEST (DONE 03/07/03)
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh kkstore
    cd ~/mm3/bed/blastz.rn2.2003-03-07-ASH/axtBest
    mkdir -p ../axtTight
    foreach i (*.axt)
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end
    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightRat.psl
    end
    # Load tables into database
    ssh hgwdev
    cd ~/mm3/bed/blastz.rn2.2003-03-07-ASH/pslTight
    hgLoadPsl mm3 chr*_blastzTightRat.psl


CHAIN&NET MM3-RN3 AXT (SWAPPED FROM RN3-MM3 BLASTZ RUN): (DONE 9/22/03 angie)

    ssh kkr1u00
    cd /cluster/data/mm3/bed/blastz.rn3.swp
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/mm3/bed/blastz.rn3.swp/axtChrom/*.axt > input.lst
    cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << this line makes emacs coloring happy

    cat << '_EOF_' > doChain
#!/bin/csh
    axtFilter -notQ=chrUn_random $1 \
    | axtChain stdin /iscratch/i/mm3/mixedNib /iscratch/i/rn3/bothMaskedNibs $2 > $3
'_EOF_'
    # << this line makes emacs coloring happy
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try
    para push # ... etc ...
#Completed: 37 of 37 jobs
#CPU time in finished jobs:      10572s     176.20m     2.94h    0.12d  0.000 y
#IO & Wait Time:                  2786s      46.43m     0.77h    0.03d  0.000 y
#Average job time:                 361s       6.02m     0.10h    0.00d
#Longest job:                      609s      10.15m     0.17h    0.01d
#Submission to last job:          1098s      18.30m     0.30h    0.01d

    # now on the cluster server, sort chains
    ssh kkstore
    cd /cluster/data/mm3/bed/blastz.rn3.swp/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    # these steps take ~20 minutes
    # optionally: rm run1/chain/*.chain

    # Create the nets.  You can do this while the database is loading
    # This is fastest done on the file server.  All told it takes about
    # 40 minutes.
    ssh kkstore
    cd /cluster/data/mm3/bed/blastz.rn3.swp/axtChain
    # First do a crude filter that eliminates many chains so the
    # real chainer has less work to do.
    mkdir preNet
    cd chain
    foreach i (*.chain)
      echo preNetting $i
      chainPreNet $i /cluster/data/mm3/chrom.sizes \
        /cluster/data/rn3/chrom.sizes ../preNet/$i
    end
    cd ..

    # Run the main netter, putting the results in n1.
    mkdir n1 
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      chainNet $i -minSpace=1 /cluster/data/mm3/chrom.sizes \
        /cluster/data/rn3/chrom.sizes ../n1/$n /dev/null
    end
    cd ..
    # Classify parts of net as syntenic, nonsyntenic etc.
    cat n1/*.net | netSyntenic stdin hNoClass.net

    # The final step of net creation needs the database.
    # Best to wait for the database load to finish if it
    # hasn't already.
    ssh hgwdev
    cd /cluster/data/mm3/bed/blastz.rn3.swp/axtChain
    netClass hNoClass.net mm3 rn3 mouse.net \
      -tNewR=/cluster/data/mm3/bed/linSpecRep \
      -qNewR=/cluster/data/rn3/bed/linSpecRep
    rm -r n1 hNoClass.net

    # Move back to the file server to create axt files corresponding
    # to the net.  Use netToAxt -maxGap=300 so we can pass nets to multiz.
    ssh kkstore
    cd /cluster/data/mm3/bed/blastz.rn3.swp/axtChain
    mkdir ../axtNet300
    netSplit mouse.net mouseNet
    cd mouseNet
    foreach i (*.net)
        set c = $i:r
	netToAxt -maxGap=300 $i ../chain/$c.chain /cluster/data/mm3/mixedNib \
          /cluster/data/rn3/softNib ../../axtNet300/$c.axt
	echo done ../axt/$c.axt
    end
    cd ..
    rm -r mouseNet

    # Make maf's for multiz
    cd /cluster/data/mm3/bed/blastz.rn3.swp
    mkdir maf300
    foreach i (axtNet300/chr*.axt)
      set maf = maf300/$i:t:r.mr.maf.unfixed
      axtToMaf $i \
            /cluster/data/mm3/chrom.sizes /cluster/data/rn3/chrom.sizes \
            $maf -tPrefix=mm3. -qPrefix=rn3.
      /cluster/bin/scripts/fixmaf.pl < $maf > $maf:r
    end
    rm maf300/*.unfixed


REGENERATE MAF FROM MOUSE-RAT NET FOR MULTIZ (DONE 10/27/03 angie)
    # Penn State complained about the drop in coverage -- so omit the 
    # -maxGap=300 in netToAxt and try again.  
    ssh kkstore
    cd /cluster/data/mm3/bed/blastz.rn3.swp/axtChain
    mkdir ../axtNet
    netSplit mouse.net mouseNet
    cd mouseNet
    foreach i (*.net)
        set c = $i:r
        netToAxt $i ../chain/$c.chain /cluster/data/mm3/mixedNib \
          /cluster/data/rn3/softNib ../../axtNet/$c.axt
        echo done ../axt/$c.axt
    end
    cd ..
    rm -r mouseNet

    # Make maf's for multiz
    cd /cluster/data/mm3/bed/blastz.rn3.swp
    mkdir maf
    foreach i (axtNet/chr*.axt)
      set maf = maf/$i:t:r.mr.maf
      axtToMaf $i \
            /cluster/data/mm3/chrom.sizes /cluster/data/rn3/chrom.sizes \
            $maf -tPrefix=mm3. -qPrefix=rn3.
    end


REGENERATE MAF FROM MOUSE-RAT AXTBEST FOR MULTIZ (DONE 10/29/03 angie)
    # Sounds like Penn State's desired coverage needs axtBest not net.
    ssh kkstore
    cd /cluster/data/mm3/bed/blastz.rn3.swp
    mkdir mafBest
    foreach i (axtBest/chr*.axt)
      set maf = mafBest/$i:t:r.mr.maf
      axtToMaf $i \
            /cluster/data/mm3/chrom.sizes /cluster/data/rn3/chrom.sizes \
            $maf -tPrefix=mm3. -qPrefix=rn3.
    end


CREATE MAF FROM MOUSE-HUMAN NET FOR MULTIZ (DONE 9/22/03 angie)
    # Use netToAxt -maxGap=300 so we can pass nets to multiz.
    ssh kkstore
    cd /cluster/data/mm3/bed/blastz.hg15.swap/axtChain
    mkdir ../axtNet300
    cd net
    foreach i (chr*.net)
        set c = $i:r
        netToAxt -maxGap=300 $i ../chain/$c.chain /cluster/data/mm3/mixedNib \
          /cluster/data/hg15/nib ../../axtNet300/$c.axt
        echo done ../axt/$c.axt
    end
    cd ..

    # Make maf's for multiz
    cd /cluster/data/mm3/bed/blastz.hg15.swap
    mkdir maf300
    foreach i (axtNet300/chr*.axt)
      set maf = maf300/$i:t:r.mh.maf.unfixed
      axtToMaf $i \
            /cluster/data/mm3/chrom.sizes /cluster/data/hg15/chrom.sizes \
            $maf -tPrefix=mm3. -qPrefix=hg15.
      /cluster/bin/scripts/fixmaf.pl < $maf > $maf:r
    end
    rm maf300/*.unfixed


REGENERATE MAF FROM MOUSE-HUMAN NET FOR MULTIZ (DONE 10/28/03 angie)
    # Penn State complained about the drop in coverage -- so omit the 
    # -maxGap=300 in netToAxt and try again.  
    ssh kkstore
    cd /cluster/data/mm3/bed/blastz.hg15.swap/axtChain
    mkdir ../axtNet
    cd net
    foreach i (chr*.net)
        set c = $i:r
        netToAxt $i ../chain/$c.chain /cluster/data/mm3/mixedNib \
          /cluster/data/hg15/nib ../../axtNet/$c.axt
        echo done ../axtNet/$c.axt
    end
    cd ..

    # Make maf's for multiz
    cd /cluster/data/mm3/bed/blastz.hg15.swap
    mkdir maf
    foreach i (axtNet/chr*.axt)
      set maf = maf/$i:t:r.mh.maf
      axtToMaf $i \
            /cluster/data/mm3/chrom.sizes /cluster/data/hg15/chrom.sizes \
            $maf -tPrefix=mm3. -qPrefix=hg15.
    end


REGENERATE MAF FROM MOUSE-HUMAN BEST FOR MULTIZ (DONE 10/29/03 angie)
    # Sounds like Penn State's desired coverage needs axtBest not net.
    ssh kkstore
    cd /cluster/data/mm3/bed/blastz.hg15.swap
    mkdir mafBest
    foreach i (axtBest/chr*.axt.gz)
      set maf = mafBest/$i:t:r:r.mh.maf
      gunzip -c $i | axtToMaf stdin \
            /cluster/data/mm3/chrom.sizes /cluster/data/hg15/chrom.sizes \
            $maf -tPrefix=mm3. -qPrefix=hg15.
    end


RUN 3-WAY ALIGNMENTS MOUSE/HUMAN/RAT USING MULTIZ (DONE 9/23/03 angie)

    ssh kkstore
    mkdir /cluster/data/mm3/bed/multiz.mm3hg15rn3.2003-09-23
    cd /cluster/data/mm3/bed/multiz.mm3hg15rn3.2003-09-23

    cat << "EOF" > ./doMultiz.csh
    foreach mh (/cluster/data/mm3/bed/blastz.hg15.swap/maf300/chr*.maf)
      set c = $mh:t:r:r
      set mr = /cluster/data/mm3/bed/blastz.rn3.swp/maf300/$c.mr.maf
      echo multiz-ing $c
      /cluster/bin/penn/multiz.7.23 $mh $mr - > $c.mhr.maf
    end
"EOF"
    # << this line makes emacs coloring happy
    csh -ef ./doMultiz.csh >& doMultiz.log &
    tail -f doMultiz.log 
    # setup up external files for database to reference
    ssh hgwdev
    mkdir /gbdb/mm3/multizHg15Rn3
    cd /gbdb/mm3/multizHg15Rn3
    foreach f (/cluster/data/mm3/bed/multiz.mm3hg15rn3.2003-09-23/*.maf)
        ln -s $f .
    end

    # load into database
    /cluster/bin/i386/hgLoadMaf -warn mm3 multizHg15Rn3

    # get featureBits coverage:
    featureBits mm3 multizHg15Rn3
    # 1738562766 bases of 2505900260 (69.379%) in intersection


RE-RUN 3-WAY ALIGNMENTS MOUSE/HUMAN/RAT USING MULTIZ (DONE 10/28/03 angie)

    ssh kkstore
    mkdir /cluster/data/mm3/bed/multiz.mm3hg15rn3.2003-10-28
    cd /cluster/data/mm3/bed/multiz.mm3hg15rn3.2003-10-28

    cat << "EOF" > ./doMultiz.csh
    foreach mh (/cluster/data/mm3/bed/blastz.hg15.swap/maf/chr*.maf)
      set c = $mh:t:r:r
      set mr = /cluster/data/mm3/bed/blastz.rn3.swp/maf/$c.mr.maf
      echo multiz-ing $c
      /cluster/bin/penn/multiz.7.23 $mh $mr - > $c.mhr.maf
    end
"EOF"
    # << this line makes emacs coloring happy
    csh -ef ./doMultiz.csh >& doMultiz.log &
    tail -f doMultiz.log 

NOT DONE:
    # setup up external files for database to reference
    ssh hgwdev
    mkdir /gbdb/mm3/multizHg15Rn3NoMaxGap
    cd /gbdb/mm3/multizHg15Rn3NoMaxGap
    foreach f (/cluster/data/mm3/bed/multiz.mm3hg15rn3.2003-10-28/*.maf)
        ln -s $f .
    end

    # load into database
    /cluster/bin/i386/hgLoadMaf -warn mm3 multizHg15Rn3NoMaxGap

    # get featureBits coverage:
    featureBits mm3 multizHg15Rn3NoMaxGap


RE-RUN 3-WAY ALIGNMENTS MOUSE/HUMAN/RAT USING MULTIZ (IN PROGRESS 10/29/03 angie)

    ssh kkstore
    mkdir /cluster/data/mm3/bed/multiz.mm3hg15rn3.2003-10-29
    cd /cluster/data/mm3/bed/multiz.mm3hg15rn3.2003-10-29

    cat << "EOF" > ./doMultiz.csh
    foreach mh (/cluster/data/mm3/bed/blastz.hg15.swap/mafBest/chr*.maf)
      set c = $mh:t:r:r
      set mr = /cluster/data/mm3/bed/blastz.rn3.swp/mafBest/$c.mr.maf
      echo multiz-ing $c
      /cluster/bin/penn/multiz.7.23 $mh $mr - > $c.mhr.maf
    end
"EOF"
    # << this line makes emacs coloring happy
    csh -ef ./doMultiz.csh >& doMultiz.log &
    tail -f doMultiz.log 

NOT DONE:
    # setup up external files for database to reference
    ssh hgwdev
    mkdir /gbdb/mm3/multizHg15Rn3Best
    cd /gbdb/mm3/multizHg15Rn3Best
    foreach f (/cluster/data/mm3/bed/multiz.mm3hg15rn3.2003-10-29/*.maf)
        ln -s $f .
    end

    # load into database
    /cluster/bin/i386/hgLoadMaf -warn mm3 multizHg15Rn3Best

    # get featureBits coverage:
    featureBits mm3 multizHg15Rn3Best


RE-AXT-ING MOUSE MM3 SELF BLASTZ ALIGNMENTS FROM PENN STATE: (DONE 09/25/03)

    # Translate Penn State .lav files into sorted axt:
    ssh kksilo
    set base="/cluster/data/mm3/bed/blastz.mm3.redo.2003-09-23"
    set seq1_dir="/cluster/data/mm3/mixedNib"
    set seq2_dir="/cluster/data/mm3/mixedNib"
    set tbl="blastzMouse"
    cd $base
    mkdir -p axtChrom
    foreach c (lav/*)
      pushd $c
      set chr=$c:t
      set out=$base/axtChrom/$chr.axt
      echo "Translating $chr lav to $out"
      cat `ls -1 *.lav | sort -g` \
        | lavToAxt stdin $seq1_dir $seq2_dir stdout \
        | axtDropOverlap stdin /cluster/data/mm3/chrom.sizes \
            /cluster/data/mm3/chrom.sizes stdout \
        | axtSort stdin $out
      popd
    end
    # chrUn_random ran out of mem; re-run in 2 passes:
    foreach chrdir (lav/chrUn_random)
      set chr=$chrdir:t
      echo two-pass translating $chr lav to axtChrom/$chr.axt
      foreach d ($chrdir/*.lav)
        set smallout=$d.axt
        lavToAxt $d $seq1_dir $seq2_dir stdout \
        | axtDropOverlap stdin /cluster/data/mm3/chrom.sizes \
            /cluster/data/mm3/chrom.sizes stdout \
        | axtSort stdin $smallout
      end
      cat `ls -1 $chrdir/*.axt | sort -g` \
        > axtChrom/$chr.axt
    end
    rm lav/chr*/*.axt*

# Did not load tables for this one -- they're very big, and low demand.
TODO    # Translate the sorted axt files into psl:
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # Load tables
    ssh hgwdev
    set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.mm3.2003-03-06-ASH"
    set tbl="blastzMouse"
    cd $base/pslChrom
    hgLoadPsl mm3 chr*_${tbl}.psl

MAKING THE BLASTZBESTMOUSE TRACK FROM PENN STATE MM3 AXT FILES (DONE 03/10/03)

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh kkstore
    set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.mm3.2003-03-06-ASH"
    set seq1_dir="/cluster/store2/mm.2003.02/mm3/mixedNib/"
    set seq2_dir="/cluster/store2/mm.2003.02/mm3/mixedNib/"
    set tbl="blastzBestMouse"
    cd $base
    mkdir -p axtBest pslBest
    foreach chrdir (lav/chr*)
      set chr=$chrdir:t
      echo axtBesting $chr
      axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
      echo translating axtBest to psl for $chr
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end
    # If a chromosome has so many alignments that axtBest runs out of mem,
    # run axtBest in 2 passes to reduce size of the input to final axtBest:
    foreach chrdir (lav/chr{1,2,3,4,6,7,X})
      set chr=$chrdir:t
      echo two-pass axtBesting $chr
      foreach d ($chrdir/*.lav)
        set smallout=$d.axt
        lavToAxt $d $seq1_dir $seq2_dir stdout \
        | axtDropSelf stdin stdout \
        | axtSort stdin $smallout
      end
      foreach a ($chrdir/*.axt)
        axtBest $a $chr $a:r.axtBest
      end
      cat `ls -1 $chrdir/*.axtBest | sort -g` \
        > $chrdir/$chr.axtBestPieces
      axtBest $chrdir/$chr.axtBestPieces $chr axtBest/$chr.axt
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end
    rm lav/chr*/*.axt*

    # Load tables
     ssh hgwdev
     set base="/cluster/store2/mm.2003.02/mm3/bed/blastz.mm3.2003-03-06-ASH"
     set tbl="blastzBestMouse"
     cd $base/pslBest
     hgLoadPsl mm3 chr*_${tbl}.psl

    # Make /gbdb links and add them to the axtInfo table:
     mkdir -p /gbdb/mm3/axtBestMm3
     cd /gbdb/mm3/axtBestMm3
     foreach f ($base/axtBest/chr*.axt)
       ln -s $f .
     end
     cd $base/axtBest
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/mm3/axtBestMm3/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo VALUES ('mm3','Best Mouse Mouse','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     hgsql mm3 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql mm3 < axtInfoInserts.sql

MAKING THE MOUSE SELF AXTTIGHT FROM AXTBEST (DONE 03/10/03)
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh kkstore
    cd ~/mm3/bed/blastz.mm3.2003-03-06-ASH/axtBest
    mkdir -p ../axtTight
    foreach i (*.axt)
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end
    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightMouse.psl
    end
    # Load tables into database
    ssh hgwdev
    cd ~/mm3/bed/blastz.mm3.2003-03-06-ASH/pslTight
    hgLoadPsl mm3 chr*_blastzTightMouse.psl


PRODUCING GENSCAN PREDICTIONS (TODO - REDO)
    
    ssh kkstore
    mkdir -p ~/mm3/bed/genscan
    cd ~/mm3/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir -p gtf pep subopt
       
    # Log into kkr1u00 (not kk!).  kkr1u00 is the driver node for the small
    # cluster (kkr2u00 -kkr8u00. Genscan has problem running on the
    # big cluster, due to limitation of memory and swap space on each
    # processing node).
    ssh kkr1u00
    cd ~/mm3/bed/genscan
    ls -1S /cluster/store2/mm.2003.02/mm3/?{,?}/chr*_*/chr*_*.fa.masked \
      > genome.list
    # Create template file, gsub, for gensub2.  For example (3-line file):
    # Note: I changed this to 1800000 in this build because some jobs were 
    # taking so long I thought they had crashed.
#LOOP
/cluster/home/kent/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=1800000
#ENDLOOP
    echo "" > dummy.list
    gensub2 genome.list dummy.list gsub jobList
    para create jobList
    para try
    para check
    para push
    # Issue either one of the following two commands to check the
    # status of the cluster and your jobs, until they are done.
    parasol status
    para check
    # If there were out-of-memory problems (run "para problems"), then 
    # re-run those jobs by hand but change the -window arg from 2400000
    # to 1200000.  
    # chr8_24

    # Convert these to chromosome level files as so:     
    ssh kkstore
    cd ~/mm3/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/chr*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/chr*.bed > \
      /dev/null
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd ~/mm3/bed/genscan
    ldHgGene mm3 genscan genscan.gtf
    hgPepPred mm3 generic genscanPep genscan.pep
    hgLoadBed mm3 genscanSubopt genscanSubopt.bed > /dev/null


TWINSCAN GENE PREDICTIONS (Done 2003-05-20 RELOADED -gtf 2004-04-02 kate)
    mkdir -p ~/mm3/bed/twinscan
    cd ~/mm3/bed/twinscan

    mkdir -p ~/mm3/bed/twinscan/gtf
    mkdir -p ~/mm3/bed/twinscan/ptx
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X)
      cd ~/mm3/bed/twinscan/gtf
      wget http://genes.cs.wustl.edu/mouse/5-01-03/gtf/chr$c.gtf
      cd ~/mm3/bed/twinscan/ptx
      wget http://genes.cs.wustl.edu/mouse/5-01-03/ptx/chr$c.ptx
    end

    cd ~/mm3/bed/twinscan/gtf
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X)
      f=chr$c
      cat chr$c.gtf | sed "s/^[0-9]*	//"  | sed "s/^/$f	/" > foo
      mv foo chr$c.gtf
    end
    ldHgGene mm3 twinscan chr*.gtf -gtf
        # 24282 gene predictions

    cd ~/mm3/bed/twinscan/ptx
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X)
      perl -wpe 's/^\>.*\s+source_id\s*\=\s*(\S+).*$/\>$1/;' < \
        chr$c.ptx | sed  "s/^>chr.*/&.a/" > chr$c-fixed.fa
    end
    hgPepPred mm3 generic twinscanPep chr*-fixed.fa

NCBI GENE MODELS (TODO)

    mkdir -p ~/mm3/bed/ncbiGenes
    cd ~/mm3/bed/ncbiGenes
    wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/maps/chr_genes.gtf.gz
    wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/protein/protein.fa.gz
    gunzip chr_genes.gtf.gz
    gunzip protein.fa.gz
    - Process the .gtf and .fa together to join IDs
    ../../jkStuff/mungeNCBIids chr_genes.gtf protein.fa |& uniq
    ldHgGene mm3 ncbiGenes chr_genes-fixed.gtf
    hgPepPred mm3 generic ncbiPep protein-fixed.fa

NCBI GENOMESCAN MODELS (TODO)

    mkdir -p ~/mm3/bed/genomeScan
    cd ~/mm3/bed/genomeScan
    wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/maps/chr_GenomeScan.gtf.gz
    # Remove the ".1" at the end of transcript_id's:
    gunzip -c chr_GenomeScan.gtf.gz | \
      perl -wpe 's/transcript_id "([^\"]+)\.1"/transcript_id "$1"/' > \
      chr_GenomeScan-fixed.gtf
    ldHgGene mm3 genomeScan chr_GenomeScan-fixed.gtf
    wget ftp://ftp.ncbi.nih.gov/genomes/M_musculus/MGSCv3_Release1/protein/GS_prot.fsa.gz
    hgPepPred mm3 generic genomeScanPep GS_prot.fsa


GET FRESH MRNA SEQUENCE FROM GENBANK

This will create a genbank.133 directory containing compressed
GenBank flat files and a mrna.133 containing unpacked sequence
info and auxiliary info in a relatively easy to parse (.ra)
format.

  o - Point your browser to ftp://ftp.ncbi.nih.gov/genbank and
      look at the README.genbank.  Figure out the current release number
      (which is 133).
  o - Consider deleting one of the older genbank releases.  It''s
      good to at least keep one previous release though.
  o - Where there is space make a new genbank directory.  Create a
      symbolic link to it:
          mkdir /cluster/store2/genbank.133
          ln -s /cluster/store2/genbank.133 ~/genbank
      cd ~/genbank
  o - ftp ftp.ncbi.nih.gov  (do anonymous log-in).  Then do the
      following commands inside ftp:
           cd genbank
           prompt
           mget gbpri* gbrod* gbv* gbsts* gbest* gbmam* gbinv* gbbct* gbhtc* gbpat* gbphg* gbpln*
      This will take at least 2 hours.
  o - Make the refSeq subdir and download files:
       mkdir -p /cluster/store2/mrna.133/refSeq
       cd /cluster/store2/mrna.133/refSeq

  o - ftp ftp.ncbi.nih.gov  (do anonymous log-in).  Then do the
      following commands inside ftp:
           cd refseq/cumulative
           prompt
           mget *.gz
  o - Unpack this into species-specific fa files and get extra info with:
      cd /cluster/store2/mrna.133/refSeq
      gunzip -c rscu.gbff.Z  | \
        gbToFaRa -byOrganism=org ../anyRna.fil refSeq.fa refSeq.ra refSeq.ta stdin

  o - ssh kkstore
      cd /cluster/store2/mrna.133

# Make the RNAs
        gunzip -c /cluster/store2/genbank.133/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
        | gbToFaRa -byOrganism=org anyRna.fil mrna.fa mrna.ra mrna.ta stdin

# Make the ESTs
        gunzip -c /cluster/store2/genbank.133/gbest*.gz | \
        gbToFaRa anyRna.fil est.fa est.ra est.ta stdin -byOrganism=org

# Make the nonhuman RNAs
        gunzip -c /cluster/store2/genbank.133/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
        | gbToFaRa humanXenoRna.fil humanXenoRna.fa humanXenoRna.ra humanXenoRna.ta stdin

# Make the nonMouse RNAs
        gunzip -c /cluster/store2/genbank.133/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
        | gbToFaRa mouseXenoRna.fil mouseXenoRna.fa mouseXenoRna.ra mouseXenoRna.ta stdin

# Make the nonRat RNAs
        gunzip -c /cluster/store2/genbank.133/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
        | gbToFaRa ratXenoRna.fil ratXenoRna.fa ratXenoRna.ra ratXenoRna.ta stdin

# Make the nonhuman ESTs
        gunzip -c /cluster/store2/genbank.133/gbest*.gz | \
        gbToFaRa humanXenoRna.fil humanXenoEst.fa humanXenoEst.ra humanXenoEst.ta stdin


PRODUCING CROSS_SPECIES mRNA ALIGMENTS (DONE 03/06/03)

    # Here you align non-mouse mRNAs against the masked genome on the
    # cluster you set up during the previous step.
    # Make sure that gbpri, gbmam, gbrod, and gbvert are downloaded from 
    # Genbank into /cluster/store2/genbank.133 and unpacked by organism into 
    # /cluster/store2/mrna.133/org. 

    # Set up cluster run more or less as so:
      ssh kk
      cd ~/mm3/bed
      mkdir xenoMrna
      cd xenoMrna
      ls -1S /scratch/hg/mm3/trfFa/* > genome.lst
      cp -R /cluster/store2/mrna.133/org /mnt/scratch/hg/mrna.133
    # The below ls command fails when you have too many files so skip it and 
    # instead run the find command after it.
    #      ls -1S /mnt/scratch/hg/mrna.133/org/*/mrna.fa > allMrna.lst
      find /mnt/scratch/hg/mrna.133/org -name mrna.fa -ls \
        | awk '{print $7,$11}' | grep -v /Mus_musculus/ \
        | sort -gr | awk '{print $2}' \
        >  allMrna.lst
    # Put the first line of allMrna.lst into 1.org, the second line into 
    # 2.org, and so forth:
      foreach n (1 2 3 4 5 6)
        head -$n allMrna.lst | tail -1 > $n.org
      end
    # After the 6th line just leave the rest in 7.org.
      tail +7 allMrna.lst > 7.org
    # Then
      ls -1 *.org > mrna.lst
      cp ~/lastMm/bed/xenoMrna/gsub .
      mkdir psl
      gensub2 genome.lst mrna.lst gsub spec
      para create spec
      para try
      para check
    # If all looks well do
      para push

    # Sort xeno mRNA alignments as so:
       ssh kkstore
       cd ~/mm3/bed/xenoMrna
       pslSort dirs raw.psl /cluster/store2/temp psl
       pslReps raw.psl cooked.psl /dev/null -minAli=0.25
       liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl
       pslSortAcc nohead chrom /cluster/store2/temp chrom.psl
       pslCat -dir chrom > xenoMrna.psl
       rm -r chrom raw.psl cooked.psl chrom.psl

    # Load into database as so:
       ssh hgwdev
       cd ~/mm3/bed/xenoMrna
       hgLoadPsl mm3 xenoMrna.psl -tNameIx

    # Make the xenoRna file
       # Make a /gbdb symlink for the .fa (not .ra)
       cd /gbdb/mm3/mrna.133
       ln -s /cluster/store2/mrna.133/mouseXenoRna.fa mouseXenoRna.fa
       hgLoadRna add -type=xenoRna mm3 /gbdb/mm3/mrna.133/mouseXenoRna.fa \
         /cluster/store2/mrna.133/mouseXenoRna.ra

PRODUCING ESTORIENTINFO TABLE (DONE - 2/19/03 MATT; _randoms DONE 03/06/03)

This table is needed for proper orientation of ESTs in the
browser.  Many will appear on the wrong strand without it.
This involves a cluster run.  First load the EST psl files
as so:
     ssh kkstore
     cd ~/mm3/bed/est
     pslSortAcc nohead contig /cluster/store2/temp contig.psl
     mkdir /mnt/scratch/hg/mm3/est
     cp -r contig /mnt/scratch/hg/mm3/est

Wait for these to finish.
     cd ..
     mkdir estOrientInfo
     cd estOrientInfo
     mkdir ei
     ls -1S /mnt/scratch/hg/mm3/est/contig/* > psl.lst
     cp ~/lastMm/bed/estOrientInfo/gsub .
Update gsub to refer to mouse contig sequence currently on
/scratch, and mouse ESTs on /scratch.
     gensub2 psl.lst single gsub spec
     para create spec
Then run the  job on the cluster
     ssh kk
     cd ~/mm3/bed/estOrientInfo
     para try
     sleep 60
     para check
If things look good
     para push

Wait for this to finish then
     liftUp estOrientInfo.bed ../../jkStuff/liftAll.lft warn ei/*.tab
Load them into database as so:
     ssh hgwdev
     cd ~/mm3/bed/estOrientInfo
     hgLoadBed mm3 estOrientInfo estOrientInfo.bed \
       -sqlTable=/cluster/home/kent/src/hg/lib/estOrientInfo.sql

PRODUCING MRNAORIENTINFO TABLE (DONE 03/06/03)
    ssh kkstore
    cd ~/mm3/bed/mrna
    pslSortAcc nohead contig /cluster/store2/temp contig.psl
    mkdir /mnt/scratch/hg/mm3/mrna
    cp -r contig /mnt/scratch/hg/mm3/mrna
    mkdir -p ~/mm3/bed/mrnaOrientInfo/oi
    cd ~/mm3/bed/mrnaOrientInfo
    ls -1S /mnt/scratch/hg/mm3/mrna/contig/* > psl.lst
    cp ~/lastMm/bed/mrnaOrientInfo/gsub .
    echo placeholder > single
    gensub2 psl.lst single gsub spec

    ssh kk
    cd ~/mm3/bed/mrnaOrientInfo
    para create spec
    para try, para check, para push, para check,...
    liftUp mrnaOrientInfo.bed ../../jkStuff/liftAll.lft warn oi/*.tab

    ssh hgwdev
    cd ~/mm3/bed/mrnaOrientInfo
    hgLoadBed mm3 mrnaOrientInfo mrnaOrientInfo.bed \
       -sqlTable=/cluster/home/kent/src/hg/lib/mrnaOrientInfo.sql


CREATE RNACLUSTER TABLE (DONE 03/06/03)
    # Make sure that refSeqAli, estOrientInfo and mrnaOrientInfo tables are 
    # made already (see above).
    ssh hgwdev
    mkdir -p ~/mm3/bed/rnaCluster/chrom
    cd ~/mm3/bed/rnaCluster
    foreach i (~/mm3/?{,?})
      foreach f ($i/chr*.fa)
        set c = $f:t:r
        clusterRna mm3 /dev/null chrom/$c.bed -chrom=$c
        echo done $c
      end
    end
    hgLoadBed mm3 rnaCluster chrom/*.bed


PRODUCING TETRAODON FISH ALIGNMENTS (TODO)

o - Download sequence from ... and put it on the cluster local disk
    at
       /scratch/hg/fish
o - Do fish/mouse alignments.
       ssh kk
       cd ~/mm/bed
       mkdir blatFish
       cd blatFish
       mkdir psl
       ls -1S /scratch/hg/fish/* > fish.lst
       ls -1S /scratch/hg/mm3/trfFa/* > mouse.lst
       cp ~/lastMm/blatFish/gsub .
       gensub2 mouse.lst fish.lst gsub spec
       para create spec
       para try
     Make sure jobs are going ok with para check.  Then
       para push
     wait about 2 hours and do another
       para push
     do para checks and if necessary para pushes until done
     or use para shove.
o - Sort alignments as so 
       pslCat -dir psl | liftUp -type=.psl stdout ~/mm/jkStuff/liftAll.lft warn stdin | pslSortAcc nohead chrom /cluster/store2/tmp stdin
o - Copy to hgwdev:/scratch.  Rename to correspond with tables as so and 
    load into database:
       ssh hgwdev
       cd ~/mm/bed/blatFish/chrom
       foreach i (*.psl)
           set r = $i:r
           mv $i ${r}_blatFish.psl
       end
       hgLoadPsl mm3 *.psl
       #*** Make /gbdb/mm3/fish_seq15jun2001 links
       hgLoadRna addSeq mm3 /gbdb/mm3/fish_seq15jun2001/*.fa

# PRODUCING SQUIRT ALIGNMENTS  
    ssh kkstore
    mkdir -p ~/mm3/bed/blatCi1
    cd ~/mm3/bed/blatCi1
    ls -1S /iscratch/i/squirt/ci1/queryFa/*.fa > squirt.lst
    ls -1S /scratch/hg/mm3/trfFa/* > mouse.lst

    rm -rf psl
    foreach ctg (`cat mouse.lst`)
      mkdir -p psl/$ctg:t:r
    end
    # get gsub2D from someplace
    gensub2 mouse.lst squirt.lst gsub2D spec

    ssh kk
    cd ~/mm3/bed/blatCi1
    para create spec
    ....
    # When cluster run is done, sort alignments:
    ssh eieio
    cd ~/mm3/bed/blatCi1
    rm -rf /tmp/$LOGNAME
    mkdir /tmp/$LOGNAME
    pslSort dirs raw.psl /tmp/$LOGNAME psl/*
    pslReps raw.psl cooked.psl /dev/null -minAli=0.05
    liftUp -nohead lifted.psl ../../jkStuff/liftAll.lft warn cooked.psl
    pslSortAcc nohead chrom /tmp/$LOGNAME lifted.psl

    # Rename to correspond with tables as so and load into database:
    ssh hgwdev
    cd ~/mm3/bed/blatCi1/chrom
    rm -f chr*_blatCi1.psl
    foreach i (chr?{,?}{,_random}.psl)
        set r = $i:r
        mv $i ${r}_blatCi1.psl
    end
    hgLoadPsl mm3 *.psl

    # Make squirt /gbdb/ symlink 
    mkdir /gbdb/mm3/squirtSeq
    cd /gbdb/mm3/squirtSeq
    ln -s /cluster/store5/squirt/ci1/ciona.rm.fasta

PRODUCING FUGU FISH ALIGNMENTS (DONE 03/13/03)

    # sequence was previously downloaded to /cluster/store3/fuguSeq/ from
    #   ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_mask.fasta.Z
    #   ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_prot.fasta.Z
    # Load it up on /mnt/scratch:
    ssh kkstore
    mkdir -p /mnt/scratch/hg/fugu
# Next time, use /cluster/store3/fuguSeq/split2.5Mb !!!
    cp -p /cluster/store3/fuguSeq/split/*.fa /mnt/scratch/hg/fugu/

    ssh kk
    mkdir ~/mm/bed/blatFugu
    cd ~/mm/bed/blatFugu
    ls -1S /mnt/scratch/hg/fugu/* > fugu.lst
    ls -1S /scratch/hg/mm3/trfFa/* > mouse.lst
    # Create dir structure for run
    mkdir psl
    foreach ctg (`ls -1 /scratch/hg/mm3/trfFa/`)
      mkdir psl/$ctg:t:r
    end
    cp ~/lastMm/bed/blatFugu/gsub .
    gensub2 mouse.lst fugu.lst gsub spec
    para create spec
    para try
    # Make sure jobs are going ok with para check.  Then
    para push
    # wait about 2 hours and do another
    para push
    # do para checks and if necessary para pushes until done or use para shove.
    ssh kkstore
    cd ~/mm/bed/blatFugu
    pslCat -dir psl/* \
      | liftUp -type=.psl stdout ~/mm3/jkStuff/liftAll.lft warn stdin \
      | pslSortAcc nohead chrom /cluster/store2/tmp stdin
    # load into database:
    ssh hgwdev
    cd ~/mm3/bed/blatFugu/chrom
    foreach i (*.psl)
      set r = $i:r
      mv $i ${r}_blatFugu.psl
    end
    hgLoadPsl mm3 *.psl
    mkdir -p /gbdb/mm3/fuguSeq
    ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta /gbdb/mm3/fuguSeq/
    hgLoadRna addSeq mm3 /gbdb/mm3/fuguSeq/fugu_v3_mask.fasta


LOAD SOFTBERRY GENES (DONE 02/04/03)
     cd /cluster/store2/mm.2003.02/mm3/bed
     mkdir softberry
     cd softberry
     wget ftp://www.softberry.com/pub/SC_MOU_FEB03/Softb_mouse_gff_f03.tar.gz
     gunzip -c Softb_mouse_gff_f03.tar.gz | tar xvf -
     ldHgGene mm3 softberryGene chr*.gff
     hgPepPred mm3 softberry *.protein
     hgSoftberryHom mm3 *.protein

LOAD GENEID GENES (DONE 03/18/03 RELOADED -gtf 2004-04-02 kate)
    mkdir -p ~/mm3/bed/geneid/download
    cd ~/mm3/bed/geneid/download
    set webroot = \
      http://genome.imim.es/genepredictions/M.musculus/mmFeb2003/geneid_v1.1
    foreach f (~/mm3/?{,?}/chr?{,?}{,_random}.fa)
      set chr = $f:t:r
      wget $webroot/$chr.gtf
      wget $webroot/$chr.prot
    end
    # Add missing .1 to protein id's
    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
    end
    cd ..
    ldHgGene mm3 geneid download/*.gtf -gtf
    # 34050 gene predictions
    runGeneCheck .
     # 34 noStart
     # 33 badFrame
     # 13 noStop
     # 11 gap
    hgPepPred mm3 generic geneidPep download/*-fixed.prot

SGP GENE PREDICTIONS (4/17/03 KRR)
    mkdir -p ~/mm3/bed/sgp/download
    cd ~/mm3/bed/sgp/download
    #foreach f (~/mm3/?{,?}/chr?{,?}{,_random}.fa)
    # Doesn't have chr_random entries KRR
    foreach f (~/mm3/?{,?}/chr?{,?}.fa)
      set chr = $f:t:r
      wget http://genome.imim.es/genepredictions/M.musculus/mmFeb2003/SGP/humangp20021114/$chr.gtf
      wget http://genome.imim.es/genepredictions/M.musculus/mmFeb2003/SGP/humangp20021114/$chr.prot
    end
    # Add missing .1 to protein id's
    foreach f (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $f > $f:r-fixed.prot
    end
    cd ..
    ldHgGene mm3 sgpGene download/*.gtf -exon=CDS
    # NOTE: perhaps need to use -gtf instead of -exon=CDS
    # check that stop codon is included (txEnd == last exonEnds)
    hgPepPred mm3 generic sgpPep download/*-fixed.prot


# TIGR GENE INDEX (DONE 2003-11-10 kate)
    ssh kkstore
    mkdir -p /cluster/data/mm3/bed/tigr
    cd /cluster/data/mm3/bed/tigr
    wget ftp://ftp.tigr.org/pub/data/tgi/Mus_musculus/TGI_track_MouseGenome_Feb2003.tgz
    tar xvfz TGI*.tgz
    foreach f (*cattle*)
      set f1 = `echo $f | sed -e 's/cattle/cow/g'`
      mv $f $f1
    end
    foreach o (mouse cow human pig rat)
      echo $o
      setenv O $o
      foreach f (chr*_$o*s)
        tail +2 $f | perl -wpe 's /THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
      end
    end
    ssh hgwdev
    cd /cluster/data/mm3/bed/tigr
    ldHgGene -exon=TC mm3 tigrGeneIndex *.gff
       # 286333 transcripts in 1180977 lines in 105 files
    gzip *TCs *.gff


LOAD STS MAP (TODO)
     - login to hgwdev
      cd ~/mm/bed
      mm3 < ~/src/hg/lib/stsMap.sql
      mkdir stsMap
      cd stsMap
      bedSort /projects/cc/hg/mapplots/data/tracks/build28/stsMap.bed stsMap.bed
      - Enter database with "mm3" command.
      - At mysql> prompt type in:
          load data local infile 'stsMap.bed' into table stsMap;
      - At mysql> prompt type

LOAD MGI IDs (TODO)
      - The Locuslink ID to MGI IDs converstion data file,
        LL2MGI.txt, from Jackson Lab should be found under
        ~/mm/bed/refSeq
      - login to hgwdev
      
      cd ~/mm/bed/refSeq
      mm3 < ~/src/hg/lib/mgiID.sql
      - Enter database with "mm3" command.
      - At mysql> prompt type in:
          load data local infile 'LL2MGI.txt' into table MGIid;
      - At mysql> prompt type
          quit

LOAD CHROMOSOME BANDS (TODO)
      - login to hgwdev
      cd /cluster/store2/mm.2003.02/mm3/bed
      mkdir cytoBands
      cp /projects/cc/hg/mapplots/data/tracks/build28/cytobands.bed cytoBands
      mm3 < ~/src/hg/lib/cytoBand.sql
      Enter database with "mm3" command.
      - At mysql> prompt type in:
          load data local infile 'cytobands.bed' into table cytoBand;
      - At mysql> prompt type
          quit

LOAD MOUSEREF TRACK (TODO)
    First copy in data from kkstore to ~/mm/bed/mouseRef.  
    Then substitute 'genome' for the appropriate chromosome 
    in each of the alignment files.  Finally do:
       hgRefAlign webb mm3 mouseRef *.alignments

LOAD AVID MOUSE TRACK (TODO)
      ssh cc98
      cd ~/mm/bed
      mkdir avidMouse
      cd avidMouse
      wget http://pipeline.lbl.gov/tableCS-LBNL.txt
      hgAvidShortBed *.txt avidRepeat.bed avidUnique.bed
      hgLoadBed avidRepeat avidRepeat.bed
      hgLoadBed avidUnique avidUnique.bed

LOAD SNPS (TODO)
      - ssh hgwdev
      - cd ~/mm/bed
      - mkdir snp
      - cd snp
      - Download SNPs from ftp://ftp.ncbi.nlm.nih.gov/pub/sherry/mouse.b27.out.gz
      - Unpack.
        createBed < mouse.b27.out > snpNih.bed
        hgLoadBed mm3 snpNih snpNih.bed

LOAD CPGISSLANDS (DONE 03/06/03)
     ssh kkstore
     mkdir -p ~/mm3/bed/cpgIsland
     cd ~/mm3/bed/cpgIsland
     # Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu)
     # copy the tar file to the current directory
     tar xvf cpg_dist.tar 
     cd cpg_dist
     gcc readseq.c cpg_lh.c -o cpglh.exe
     cd ..
     # cpglh.exe requires hard-masked (N) .fa's.  
     # There may be warnings about "bad character" for IUPAC ambiguous 
     # characters like R, S, etc.  Ignore the warnings.  
     foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked)
       set fout=$f:t:r:r.cpg
       ./cpg_dist/cpglh.exe $f > $fout
       echo Done with $fout
     end
     cp ~/lastMm/bed/cpgIsland/filter.awk .
     awk -f filter.awk chr*.cpg > cpgIsland.bed
     # Load into db
     ssh hgwdev
     cd ~/mm3/bed/cpgIsland
     hgLoadBed mm3 cpgIsland -tab -noBin \
       -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed

LOAD ENSEMBL GENES (DONE Matt June 6, 03) Peptides 2003-10-01 braney
     cd ~/mm3/bed
     mkdir ensembl
     cd ensembl

        Get the ensembl gene data from http://www.ensembl.org/
        Go to the Martview link
        Choose Mus musculus as the organism
        Follow this sequence through the pages:
        Page 1) Choose the Ensembl Genes choice. Hit next.
        Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.        
        Page 3) Choose the "Structures" box. 
        Page 4) Choose GTF as the ouput, choose gzip compression and then hit Export.gunzip the file and name to mm3EnsGene.gtf.gz. Unzip it.

# Ensembl handles random chromosomes differently than us, so we
# strip this data.  Fortunately it just loses a couple of genes.
     grep -v ^6_DR51 ensembl.gtf | grep -v _NT_ > unrandom.gtf

# Add "chr" to front of each line in the gene data gtf file to make 
# it compatible with ldHgGene 
     ~matt/bin/addchr.pl unrandom.gtf ensGene.gtf
     ~matt/bin/fixEns.pl ensGene.gtf ensFixed.gtf 
     ldHgGene mm3 ensGene ensGene.gtf

o - Load Ensembl peptides:
        Get the ensembl protein data from http://www.ensembl.org/
        Go to the Martview link
        Choose Mus musculus as the organism
        Follow this sequence through the pages:
        Page 1) Choose the Ensembl Genes choice. Hit next.
        Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
        Page 3) Choose the "Sequences" box. 
        Page 4) Choose Transcripts/Proteins and peptide Only as the ouput, choose text/fasta and gzip compression and then hit export. Name to mm3EnsGene.pep.gz

     If needed, substitute ENST for ENSP in ensPep with the program called subs
     edit subs.in to read: ENSP|ENST
     #subs -e mm3EnsGene.pep > /dev/null

     #~matt/bin/fixPep.pl mm3EnsGene.pep ensembl.pep
     #delete * at end of each protein
     sed "s/\*$//" mm3EnsGene.pep > ensembl.pep
     hgPepPred mm3 generic ensPep ensembl.pep

o - Load ensGtp table.
    # ensGtp associates geneId/transcriptId/proteinId for hgPepPred and 
    # hgKnownToSuper.  Use ensMart to create it as above, except:
    # Page 3) Choose the "Features" box. In "Ensembl Attributes", check 
    # Ensembl Gene ID, Ensembl Transcript ID, Ensembl Peptide ID.  
    # Choose Text, tab-separated as the output format.  Result name ensGtp.
    # Save file as ensGtp.txt.gz
    gunzip ensGtp.txt.gz
    hgsql hg16 < ~/kent/src/hg/lib/ensGtp.sql
    echo "load data local infile 'ensGtp.txt' into table ensGtp" | hgsql -N hg16


LOAD ENSEMBL ESTS (DONE June 6, 03)
     cd ~/mm3/bed
     mkdir ensemblEst
     cd ensemblEst

        Get the ensembl EST data from http://www.ensembl.org/
        Go to the Martview link
        Choose Mus musculus as the organism
        Follow this sequence through the pages:
        Page 1) Choose the Ensembl ESTs choice. Hit next.
        Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.        
        Page 3) Choose the "Structures" box. 
        Page 4) Choose GTF as the ouput, choose gzip compression and then hit Export.gunzip the file and name to mm3EnsEst.gtf.gz. Unzip it.

# Ensembl handles random chromosomes differently than us, so we
# strip this data.  Fortunately it just loses a couple of genes.
     grep -v ^6_DR51 mm3EnsEst.gtf | grep -v _NT_ > unrandom.gtf

# Add "chr" to front of each line in the gene data gtf file to make 
# it compatible with ldHgGene 
     ~matt/bin/addchr.pl unrandom.gtf ensEst.gtf
     ~matt/bin/fixEns.pl ensEst.gtf ensFixed.gtf 
     ldHgGene mm3 ensEst ensEst.gtf

o - Load Ensembl peptides:
        Get the ensembl protein data from http://www.ensembl.org/
        Go to the Martview link
        Choose Mus musculus as the organism
        Follow this sequence through the pages:
        Page 1) Choose the Ensembl ESTs choice. Hit next.
        Page 2) Uncheck the "Limit to" box in the region choice. Then hit next.
        Page 3) Choose the "Sequences" box. 
        Page 4) Choose Transcripts/Proteins and Gene sequence Only as the ouput, choose text/fasta and gzip compression and then hit export. Name to mm3EnsEst.pep.gz

     If needed, substitute ENST for ENSP in ensPep with the program called subs
     edit subs.in to read: ENSP|ENST
     subs -e mm3EnsEst.pep > /dev/null

     ~matt/bin/fixPep.pl mm3EnsEst.pep ensembl.pep
     hgPepPred mm3 generic ensEstPep ensembl.pep

LOAD RNAGENES (TODO)
      - login to hgwdev
      - cd ~kent/src/hg/lib
      - mm3 < rnaGene.sql
      - cd /cluster/store2/mm.2003.02/mm3/bed
      - mkdir rnaGene
      - cd rnaGene
      - download data from ftp.genetics.wustl.edu/pub/eddy/pickup/ncrna-oo27.gff.gz
      - gunzip *.gz
      - liftUp chrom.gff ../../jkStuff/liftAll.lft carry ncrna-oo27.gff
      - hgRnaGenes mm3 chrom.gff

LOAD EXOFISH (TODO)
     - login to hgwdev
     - cd /cluster/store2/mm.2003.02/mm3/bed
     - mkdir exoFish
     - cd exoFish
     - mm3 < ~kent/src/hg/lib/exoFish.sql
     - Put email attatchment from Olivier Jaillon (ojaaillon@genoscope.cns.fr)
       into /cluster/store2/mm.2003.02/mm3/bed/exoFish/all_maping_ecore
     - awk -f filter.awk all_maping_ecore > exoFish.bed
     - hgLoadBed mm3 exoFish exoFish.bed

LOAD MOUSE SYNTENY (TODO)
     - login to hgwdev.
     - cd ~/kent/src/hg/lib
     - mm3 < mouseSyn.sql
     - mkdir ~/mm/bed/mouseSyn
     - cd ~/mm/bed/mouseSyn
     # Put Deanna Church's (church@ncbi.nlm.nih.gov) email attatchment as
     # mouseSyn.txt
     - awk -f format.awk *.txt > mouseSyn.bed
     - delete first line of mouseSyn.bed
     - Enter database with "mm3" command.
     - At mysql> prompt type in:
          load data local infile 'mouseSyn.bed' into table mouseSyn


LOAD GENIE (TODO)
     mkdir -p ~/mm3/bed/genieAlt
     cd ~/mm3/bed/genieAlt
     wget http://www.neomorphic.com/mgap/mgscv3/gtf/mgscv3.genie.gtf.tgz
     gunzip -c mgscv3.genie.gtf.tgz | tar xvf -
     ldHgGene mm3 genieAlt mgscv3.genie.gtf/chr*.gtf
     wget http://www.neomorphic.com/mgap/mgscv3/fa/mgscv3.aa.tgz
     gunzip -c mgscv3.aa.tgz | tar xvf -
     hgPepPred mm3 genie geniePep chr*.aa.fa

LOAD GENIE CLONE BOUNDS (TODO)
     mkdir -p ~/mm3/bed/genieBounds
     cd ~/mm3/bed/genieBounds
     wget http://www.neomorphic.com/mgap/mgscv3/cb.bed/mgscv3_cb.bed.tgz
     gunzip -c mgscv3_cb.bed.tgz | tar xvf -
     - Trim the track definition from each file (these are actually custom 
       track files):
     foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Un)
       tail +2 chr${c}_cb.bed > chr${c}_cb-fixed.bed
     end
     hgLoadBed mm3 genieBounds *-fixed.bed

LOAD JACKSON LABS QTL (TODO)
    ssh hgwdev
    mkdir ~/mm3/bed/jaxQTL2
    # Save the email attachment from Sridhar Ramachandran at Jackson Labs
    # (bed 8+, jaxQTL2 format).
    # Strip the column headers and load into the database.  
    tail +2 QTLBedFormat.txt > jaxQTL2.bed
    hgLoadBed -noBin -tab -sqlTable=$HOME/kent/src/hg/lib/jaxQTL2.sql \
      mm3 jaxQTL2 jaxQTL2.bed


MAKING MOUSE AND RAT SYNTENY
#
syntenicBest.pl -db=mm3 -table=blastzBestHuman
smooth.pl
joinsmallgaps.pl
fillgap.pl -db=mm3 -table=blastzBestHuman
synteny2bed.pl
hgLoadBed mm3 syntenyHuman ucsc100k.bed

syntenicBest.pl -db=mm3 -table=blastzBestRat
smooth.pl
joinsmallgaps.pl
fillgap.pl -db=mm3 -table=blastzBestRat
synteny2bed.pl
hgLoadBed mm3 syntenyRat ucsc100k.bed



CREATING THE mm3Rn2L SAMPLE TRACK (a.k.a WIGGLE TRACK)
------------------------------------------------------
o - refer to the script at src/hg/sampleTracks/makeMm3Rn2.doc

# create MGC Genes track (markd 13 may 2003)
# this will be automated in the incremental genbank update
/cluster/store5/genbank/bin/i386/mgcDbLoad mm3 /cluster/store5/genbank/data/download/mgc/2003.05.06/mgcFullStatus.tab.gz 


# SWAPPING RAT RN3 BLASTZ ALIGNMENTS TO MOUSE-RAT (IN PROGRESS 2003-08-06 kate)

    ssh kkstore
    # Rat-mouse alignments were already run and processed into axt.  
    # Swap target and query to get mouse-rat alignments.  
    set aliDir = "/cluster/data/rn3/bed/blastz.mm3"
    set revAliDir = "/cluster/data/mm3/bed/blastz.rn3.swp"
    mkdir $revAliDir
    cd $revAliDir
    # axtBest will need .len files - copy those, swap S1<->S2
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    mkdir unsorted axtChrom
    # Swap target and query coords, then re-apportion alignments so that 
    # unsorted/chrN.axt has all the alignments with chrN as target.
    cat $aliDir/axtChrom/chr*.axt \
    | /cluster/bin/i386/axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | /cluster/bin/i386/axtSplitByTarget stdin unsorted
    # Sort the shuffled .axt files.
    foreach f (unsorted/*.axt)
      echo sorting $f:t:r
      axtSort $f axtChrom/$f:t
    end
    rm -r unsorted

    # Consolidate AXT files to chrom level, sort, pick best
    ssh kkstore
    set base="/cluster/data/mm3/bed/blastz.rn3.swp"
    set seq1_dir="/cluster/data/mm3.RM030619/mixedNib/"
    set seq2_dir="/cluster/data/rn3/softNib/"
    cd $base
    mkdir -p axtBest pslBest
    foreach f (axtChrom/chr*.axt)
      set chr=$f:t:r
      echo axtBesting $chr
      axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
    end

=======

MAKE THE affyU74 TRACK
# Recalculate alignments and load data using Affy consensus sequences
# for Affy mouse U74 chips instead of target sequences 
#[DONE, hartera, Feb 5, 2004]
----------------------------------
# Load up semi-local disk with target sequences for Affy mouse U74 chips.
ssh kkr1u00
mkdir -p /iscratch/i/affy
#       This /projects filesystem is not available on kkr1u00
#       but it is on kk
ssh kk
cp /projects/compbio/data/microarray/affyGnfMouse/sequences/U74*consensus.fa /iscratch/i/affy

ssh kkr1u00
iSync

# Run cluster job to do alignments
ssh kk
cd /cluster/data/mm3/bed
mkdir affyU74.2004-02-05/
cd affyU74.2004-02-05
mkdir run
cd run
mkdir psl
echo /scratch/hg/mm3/trfFa/*.fa | wordLine stdin > genome.lst
ls -1 /iscratch/i/affy/U74*consensus.fa > affy.lst
echo '#LOOP\n/cluster/bin/i386/blat -fine -mask=lower -minIdentity=95 -ooc=/scratch/hg/h/11.ooc  $(path1) $(path2) {check out line+ psl/$(root1)_$(root2).psl}\n#ENDLOOP' > gsub
gensub2 genome.lst affy.lst gsub spe

para create spec
para try
# Then do para check/para push etc. until job is done, or para make. 
# para time 2004-02-05
# Completed: 1662 of 1662 jobs
# CPU time in finished jobs:      12092s     201.53m     3.36h    0.14d  0.000 y
# IO & Wait Time:                  4374s      72.90m     1.21h    0.05d  0.000 y
# Average job time:                  10s       0.17m     0.00h    0.00d
# Longest job:                       37s       0.62m     0.01h    0.00d
# Submission to last job:          1801s      30.02m     0.50h    0.02d

# Do sort, best in genome filter, and convert to chromosome coordinates
# to create affyU74.psl.
pslSort dirs raw.psl tmp psl
                                                                                # change filter parameters for these sequences. only use alignments that
# cover 30% of sequence and have at least minAli = 0.95.
# minAli = 0.97 too high. low minCover as a lot of n's in these sequences
pslReps -minCover=0.3 -sizeMatters -minAli=0.95 -nearTop=0.005 raw.psl contig.psl /dev/null
liftUp ../all_affyU74.psl ../../../jkStuff/liftAll.lft warn contig.psl
cd ..

# Sort by chromosome and load into database.
ssh hgwdev
cd /cluster/data/mm3/bed/affyU74.2004-02-05
pslSortAcc nohead chrom temp all_affyU74.psl
cat chrom/*.psl > affyU74.psl
# Feb 24, 2004, shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
# and reload data into table, this is so qName matches those in interdependent
# tables in hgFixed
hgLoadPsl mm3 affyU74.psl -tNameIx
rm -r chrom temp run

MAKE THE affyGnfU74 TRACKs
# Make bed files and load consensus sequences for Affy U74 chip set.
# [DONE, hartera, Feb 5, 2004]
----------------------------------

#This needs to be done after affyU74 is already made.
# Added path for latest version of affyPslAndAtlasToBed
ssh hgwdev
mkdir -p /cluster/data/mm3/bed/affyGnf.2004-02-05
cd /cluster/data/mm3/bed/affyGnf.2004-02-05
affyPslAndAtlasToBed ../affyU74.2004-02-05/affyU74.psl /projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74 affyGnfU74A.bed affyGnfU74A.exp -newType -chip=U74Av2
affyPslAndAtlasToBed ../affyU74.2004-02-05/affyU74.psl /projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt affyGnfU74B.bed affyGnfU74B.exp -newType -chip=U74Bv2
affyPslAndAtlasToBed ../affyU74.2004-02-05/affyU74.psl /projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt affyGnfU74C.bed affyGnfU74C.exp -newType -chip=U74Cv2

# Feb 24, 2004, shorten qName to "xxxx_at" instead of "U74Xv2:xxxx_at;"
# and reload data into table
hgLoadBed mm3 affyGnfU74A affyGnfU74A.bed
hgLoadBed mm3 affyGnfU74B affyGnfU74B.bed
hgLoadBed mm3 affyGnfU74C affyGnfU74C.bed

# Add in sequence data for U74 tracks.
# Copy consensus sequence to /gbdb if it isn't already
# [DONE, hartera, Feb 6, 2004]
    mkdir -p /gbdb/hgFixed/affyProbes
    cd /gbdb/hgFixed/affyProbes
    ln -s /projects/compbiodata/microarray/affyGnfMouse/sequences/U74Av2_consensus.fa .
    ln -s /projects/compbiodata/microarray/affyGnfMouse/sequences/U74Bv2_consensus.fa .
    ln -s /projects/compbiodata/microarray/affyGnfMouse/sequences/U74Cv2_consensus.fa .
    # use perl -pi.bak -e 's/;/ /' <file> to remove ";" after probe name
    # in sequence files.
    # reload sequences with "U74Xv2" prefix removed so acc matches name used
    # in other dependent tables.
    hgLoadSeq -abbr=U74Av2: mm3 /gbdb/hgFixed/affyProbes/U74Av2_consensus.fa
    hgLoadSeq -abbr=U74Bv2: mm3 /gbdb/hgFixed/affyProbes/U74Bv2_consensus.fa
    hgLoadSeq -abbr=U74Cv2: mm3 /gbdb/hgFixed/affyProbes/U74Cv2_consensus.fa
    
MAKE ALIAS TABLES FOR hgFind
----------------------------------

o Create alias tables to facilitate hgFind.

  First create tables of kgXref, kgAlias and kgProtAlias in mm3, using

	kgXref.sql
	kgAlias.sql  
	kgProtAlias.sql  

o Build kgXref table
 
  Generate xref .tab file for KG

	kgXref mm3 proteins0305

  Load it into mySQL

	load data local infile "kgXref.tab" into table mm3.kgXref;
  
o Build gene aliases

  Generate aliases from hugo, etc

	kgAliasM mm3 proteins0305

  Generate gene aliases from SWISS-PROT data 

    kgAliasP mm3 /cluster/store5/swissprot/0305/sprot.dat      sp.lis
    kgAliasP mm3 /cluster/store5/swissprot/0305/trembl.dat     tr.lis
    kgAliasP mm3 /cluster/store5/swissprot/0305/trembl_new.dat new.lis
    cat sp.lis tr.lis new.lis |sort|uniq >kgAliasP.tab
    rm  sp.lis tr.lis new.lis 

  Generate gene aliases from RefSeq data

    	kgAliasRefseq mm3 

  Concatenate all 3 files

	cat kgAliasM.tab kgAliasRefseq.tab kgAliasP.tab|sort|uniq > kgAlias.tab

  Load it into mySQL table

	load data local infile "kgAlias.tab" into table mm3.kgAlias;

o Build protein aliases

  Generate protein aliases

	kgProtAlias mm3 proteins0305

  Generate protein aliases from NCBI data

	kgProtAliasNCBI mm3

  Concatenate both files

	cat kgProtAliasNCBI.tab kgProtAlias.tab|sort|uniq > kgProtAliasBoth.tab

  Load it into mySQL tables

	load data local infile "kgProtAliasBoth.tab" into table mm3.kgProtAlias;

# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
# This depends on the go and uniProt databases as well as 
# the kgAlias and kgProAlias tables.  The hgKgGetText takes
# about 5 minutes when the database is not too busy.  The rest
# is real quick.
     ssh hgwdev
     cd /cluster/data/mm3/bed/
     mkdir -p kgMm3/index
     cd kgMm3/index
     hgKgGetText mm3 knownGene.text
     ixIxx knownGene.text knownGene.ix knownGene.ixx
     ln -s /cluster/data/mm3/bed/kgMm3/index/knownGene.ix /gbdb/mm3/knownGene.ix
     ln -s /cluster/data/mm3/bed/kgMm3/index/knownGene.ixx /gbdb/mm3/knownGene.ixx


######## MAKING FOLDUTR TABLES #######
#
# First set up directory structure and extract UTR sequence on hgwdev
    ssh hgwdev
    mkdir -p /cluster/data/mm3/bed/rnaStruct
    cd /cluster/data/mm3/bed/rnaStruct
    mkdir -p utr3/split utr5/split utr3/fold utr5/fold
    utrFa mm3 knownGene utr3 utr3/utr.fa
    utrFa mm3 knownGene utr5 utr5/utr.fa

# Split up files and make files that define job.
    ssh kk
    cd /cluster/data/mm3/bed/rnaStruct
    faSplit sequence utr3/utr.fa 50000 utr3/split/s >/dev/null
    faSplit sequence utr5/utr.fa 50000 utr5/split/s >/dev/null
    ls -1 utr3/split > utr3/in.lst
    ls -1 utr5/split > utr5/in.lst
    cd utr3
    cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
    cp gsub ../utr5

# Do cluster run for 3' UTRs
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
#CPU time in finished jobs:     842416s   14040.26m   234.00h    9.75d  0.027 y
#IO & Wait Time:                 78541s    1309.02m    21.82h    0.91d  0.002 y
#Average job time:                  32s       0.53m     0.01h    0.00d
#Longest job:                     3318s      55.30m     0.92h    0.04d
#Submission to last job:          4282s      71.37m     1.19h    0.05d

# Do cluster run for 5' UTRs 
    cd ../utr5
    gensub2 in.lst single gsub spec
    para create spec
    para try
    para push
#Completed: 25808 of 25808 jobs
#CPU time in finished jobs:      51700s     861.67m    14.36h    0.60d  0.002 y
#IO & Wait Time:                114430s    1907.16m    31.79h    1.32d  0.004 y
#Average job time:                   6s       0.11m     0.00h    0.00d
#Longest job:                     1044s      17.40m     0.29h    0.01d
#Submission to last job:          1164s      19.40m     0.32h    0.01d


# Load database
    ssh hgwdev
    cd /cluster/data/mm3/bed/rnaStruct/utr5
    hgLoadRnaFold mm3 foldUtr5 fold
    cd ../utr3
    hgLoadRnaFold mm3 foldUtr3 fold

# Clean up
    rm -r split fold err batch.bak
    cd ../utr3
    rm -r split fold err batch.bak


######## MAKING FAMILY BROWSER TABLES #######
# These are instructions for building the
# neighborhood browser.  Don't start these until
# there is a knownGene track.


# Cluster together various alt-splicing isoforms.
ssh hgwdev
hgClusterGenes mm3 knownGene knownIsoforms knownCanonical

# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir -p /cluster/data/mm3/bed/blastp
cd /cluster/data/mm3/bed/blastp
pepPredToFa mm3 knownGenePep known.faa
formatdb -i known.faa -t known -n known

# Copy over database to bluearc
mkdir -p /cluster/bluearc/mm3/blastp
if (-e /cluster/bluearc/mm3/blastp) then
   rm -r /cluster/bluearc/mm3/blastp
endif
mkdir -p /cluster/bluearc/mm3/blastp
cp /cluster/data/mm3/bed/blastp/known.* /cluster/bluearc/mm3/blastp

# Load up cluster/bluearc with blastp and related files
# if necessary
if (! -e /cluster/bluearc/blast/blastall) then
    mkdir -p /cluster/bluearc/blast
    cp /projects/compbio/bin/i686/blastall /cluster/bluearc/blast
    mkdir -p /cluster/bluearc/blast/data
    cp /projects/compbio/bin/i686/data/* /cluster/bluearc/blast/data
endif

# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/mm3/bed/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg

# Make parasol run directory 
ssh kk9
mkdir -p /cluster/data/mm3/bed/blastp/self
cd /cluster/data/mm3/bed/blastp/self
mkdir run
cd run
mkdir out

# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /cluster/bluearc/blast/data
/cluster/bluearc/blast/blastall -p blastp -d /cluster/bluearc/mm3/blastp/known -i \$1 -o \$2 -e 0.01 -m 8 -b 1000
end
chmod a+x blastSome

# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub spec
para create spec
para try

# Wait a couple of minutes, and do a para check,  if all is good
# then do a
para push
# This should finish in ~10 minutes if the cluster is free.

#Completed: 7733 of 7733 jobs
#CPU time in finished jobs:      41604s     693.39m    11.56h    0.48d  0.001 y
#IO & Wait Time:                 20190s     336.51m     5.61h    0.23d  0.001 y
#Average job time:                   8s       0.13m     0.00h    0.00d
#Longest job:                       35s       0.58m     0.01h    0.00d
#Submission to last job:           690s      11.50m     0.19h    0.01d

# Load into database.  This takes about an hour.
ssh hgwdev
cd /cluster/data/mm3/bed/blastp/self/run/out
hgLoadBlastTab mm3 knownBlastTab *.tab

# Create table that maps between known genes and RefSeq
hgMapToGene mm3 refGene knownGene knownToRefSeq

# Create table that maps between known genes and LocusLink (DONE 5/5/04 Fan)
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" mm3 \
        > refToLl.txt
hgMapToGene mm3 refGene knownGene knownToLocusLink -lookup=refToLl.txt
#       row count is 30116 

# Create a table that maps between known genes and 
# the nice affy expression data.
hgMapToGene mm3 affyU74  knownGene knownToU74

# Format and load the GNF data
cd /cluster/data/mm3/bed
mkdir affyGnf95
cd affyGnf95
affyPslAndAtlasToBed -newType ../affyU95.psl /projects/compbio/data/microarray/affyGnfHuman/data_public_U95 affyGnfU95.tab affyGnfU95Exps.tab -shortOut
hgsql mm3 < ~/src/hg/affyGnf/affyGnfU95.sql

# Create table that gives distance in expression space between 
# GNF genes.
hgExpDistance mm3 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance -lookup=knownToU74
hgExpDistance mm3 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance -lookup=knownToU74
hgExpDistance mm3 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance -lookup=knownToU74

# Make sure that GO database is up to date.
See README in /cluster/store1/geneOntology.

# Create knownToEnsembl column
hgMapToGene mm3 ensGene knownGene knownToEnsembl

# Make knownToCdsSnp column.  This is a little complicated by
# having to merge data form the snpTsc and the snpNih tracks.
hgMapToGene mm3 snpNih knownGene knownToCdsSnp -all -cds


# Make C. elegans ortholog column using blastp on wormpep.
# First make C. elegans protein database and copy it to cluster/bluearc
# if it doesn't exist already
cd /cluster/data/ce1/bed
mkdir blastp
cd blastp
wget ftp://ftp.sanger.ac.uk/pub/databases/wormpep/wormpep
mv wormpep wormPep.faa
formatdb -i wormPep.faa -t wormPep -n wormPep
ssh kkr1u00
if (-e /cluster/bluearc/ce1/blastp) then
   rm -r /cluster/bluearc/ce1/blastp
endif
mkdir -p /cluster/bluearc/ce1/blastp
cp /cluster/data/ce1/bed/blastp/wormPep.p?? /cluster/bluearc/ce1/blastp

# Make parasol run directory 
ssh kk
mkdir -p /cluster/data/mm3/bed/blastp/ce1
cd /cluster/data/mm3/bed/blastp/ce1
mkdir run
cd run
mkdir out

# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /cluster/bluearc/blast/data
/cluster/bluearc/blast/blastall -p blastp -d /cluster/bluearc/ce1/blastp/wormPep -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
chmod a+x blastSome

# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub spec
para create spec
para try

# Wait a couple of minutes, and do a para check,  if all is good
# then do a
para push

# This should finish in ~10 minutes if the cluster is free.
# Here's the para time results
#Completed: 7733 of 7733 jobs
#CPU time in finished jobs:      22954s     382.56m     6.38h    0.27d  0.001 y
#IO & Wait Time:                 20560s     342.67m     5.71h    0.24d  0.001 y
#Average job time:                   6s       0.09m     0.00h    0.00d
#Longest job:                       16s       0.27m     0.00h    0.00d
#Submission to last job:           618s      10.30m     0.17h    0.01d


# Load into database.  
ssh hgwdev
cd /cluster/data/mm3/bed/blastp/ce1/run/out
hgLoadBlastTab mm3 ceBlastTab -maxPer=1 *.tab

# Make human ortholog column using blastp on human known genes.
# First make human protein database and copy it to cluster/bluearc
# if it doesn't exist already
cd /cluster/data/hg16/bed
mkdir blastp
cd blastp
pepPredToFa hg16 knownGenePep known.faa
formatdb -i known.faa -t known -n known
ssh kkr1u00
if (-e /cluster/bluearc/hg16/blastp) then
   rm -r /cluster/bluearc/hg16/blastp
endif
mkdir -p /cluster/bluearc/hg16/blastp
cp /cluster/data/hg16/bed/blastp/known.p?? /cluster/bluearc/hg16/blastp

# Make parasol run directory 
ssh kk9
mkdir -p /cluster/data/mm3/bed/blastp/hg16
cd /cluster/data/mm3/bed/blastp/hg16
mkdir run
cd run
mkdir out

# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /cluster/bluearc/blast/data
/cluster/bluearc/blast/blastall -p blastp -d /cluster/bluearc/hg16/blastp/known -i \$1 -o \$2 -e 0.001 -m 8 -b 1
end
chmod a+x blastSome

# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub spec
para create spec
para try

# Wait a couple of minutes, and do a para check,  if all is good
# then do a
para push

# Load into database.  
ssh hgwdev
cd /cluster/data/mm3/bed/blastp/hg16/run/out
hgLoadBlastTab mm3 hgBlastTab -maxPer=1 *.tab

# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
cd /cluster/data/dm1/bed
mkdir blastp
cd blastp
wget ftp://ftp.ensembl.org/pub/current_zebrafish/data/fasta/pep/Danio_rerio.ZFISH2.pep.fa.gz 
zcat Dan*.pep.fa.gz > ensembl.faa
echo "Translation:" > subs.in
subs -e ensembl.faa > /dev/null
formatdb -i ensembl.faa -t ensembl -n ensembl
if (-e /cluster/bluearc/dr1/blastp) then
   rm -r /cluster/bluearc/dr1/blastp
endif
mkdir -p /cluster/bluearc/dr1/blastp
cp /cluster/data/dr1/bed/blastp/ensembl.p?? /cluster/bluearc/dr1/blastp

# Make parasol run directory 
ssh kk9
mkdir -p /cluster/data/mm3/bed/blastp/dr1
cd /cluster/data/mm3/bed/blastp/dr1
mkdir run
cd run
mkdir out

# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /cluster/bluearc/blast/data
/cluster/bluearc/blast/blastall -p blastp -d /cluster/bluearc/dr1/blastp/ensembl -i \$1 -o \$2 -e 0.005 -m 8 -b 1
end
chmod a+x blastSome

# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub spec
para create spec
para try

# Wait a couple of minutes, and do a para check,  if all is good
# then do a
para push

# Load into database.  
ssh hgwdev
cd /cluster/data/mm3/bed/blastp/dr1/run/out
hgLoadBlastTab mm3 drBlastTab -maxPer=1 *.tab

# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
cd /cluster/data/sc1/bed
mkdir blastp
cd blastp
wget ftp://genome-ftp.stanford.edu/pub/yeast/data_download/sequence/genomic_sequence/orf_protein/orf_trans.fasta.gz
zcat orf_trans.fasta.gz > sgd.faa
echo "ORFP:|" > subs.in
subs -e sgd.faa > /dev/null
formatdb -i sgd.faa -t sgd -n sgd
if (-e /cluster/bluearc/sc1/blastp) then
   rm -r /cluster/bluearc/sc1/blastp
endif
mkdir -p /cluster/bluearc/sc1/blastp
cp /cluster/data/sc1/bed/blastp/sgd.p?? /cluster/bluearc/sc1/blastp

# Make parasol run directory 
ssh kk9
mkdir -p /cluster/data/mm3/bed/blastp/sc1
cd /cluster/data/mm3/bed/blastp/sc1
mkdir run
cd run
mkdir out

# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /cluster/bluearc/blast/data
/cluster/bluearc/blast/blastall -p blastp -d /cluster/bluearc/sc1/blastp/sgd -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
chmod a+x blastSome

# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub spec
para create spec
para try

# Wait a couple of minutes, and do a para check,  if all is good
# then do a
para push

# Load into database.  
ssh hgwdev
cd /cluster/data/mm3/bed/blastp/sc1/run/out
hgLoadBlastTab mm3 scBlastTab -maxPer=1 *.tab

# Make Drosophila melanagaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# if it doesn't exist already
cd /cluster/data/dm1/bed
mkdir blastp
cd blastp
wget ftp://ftp.fruitfly.org/pub/download/dmel_RELEASE3-1/FASTA/whole_genome_translation_dmel_RELEASE3-1.FASTA.gz
zcat whole_ge*.gz | faFlyBaseToUcsc stdin flyBase.faa
formatdb -i flyBase.faa -t flyBase -n flyBase
if (-e /cluster/bluearc/dm1/blastp) then
   rm -r /cluster/bluearc/dm1/blastp
endif
mkdir -p /cluster/bluearc/dm1/blastp
cp /cluster/data/dm1/bed/blastp/flyBase.p?? /cluster/bluearc/dm1/blastp

# Make parasol run directory 
ssh kk
mkdir -p /cluster/data/mm3/bed/blastp/dm1
cd /cluster/data/mm3/bed/blastp/dm1
mkdir run
cd run
mkdir out

# Make blast script
cat > blastSome <<end
#!/bin/csh
setenv BLASTMAT /cluster/bluearc/blast/data
/cluster/bluearc/blast/blastall -p blastp -d /cluster/bluearc/dm1/blastp/flyBase -i \$1 -o \$2 -e 0.01 -m 8 -b 1
end
chmod a+x blastSome

# Make gensub2 file
cat > gsub <<end
#LOOP
blastSome {check in line+ \$(path1)} {check out line out/\$(root1).tab}
#ENDLOOP
end

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub spec
para create spec
para try

# Wait a couple of minutes, and do a para check,  if all is good
# then do a
para push

#Results of this:
#Completed: 5814 of 5814 jobs
#CPU time in finished jobs:      62285s    1038.09m    17.30h    0.72d  0.002 y
#IO & Wait Time:                 23056s     384.26m     6.40h    0.27d  0.001 y
#Average job time:                  15s       0.24m     0.00h    0.00d
#Longest job:                      152s       2.53m     0.04h    0.00d
#Submission to last job:           290s       4.83m     0.08h    0.00d


# Load into database.  
ssh hgwdev
cd /cluster/data/mm3/bed/blastp/dm1/run/out
hgLoadBlastTab mm3 dmBlastTab -maxPer=1 *.tab

# CREATING KNOWNtOsUPER (which enables superFamily stuff in hgNear/hgGene)
# First see if need to update superfamily data from 
# ftp server at supfam.mrc-lmb.cam.ac.uk following instructions
# in /cluster/store1/superFamily/genomes/README.ucsc.  Then
# make sure that knownToEnsembl and ensGtp tables are created, then:
    zcat /cluster/store1/superFamily/genomes/ass_26-Oct-2003.tab.gz | hgKnownToSuper mm3 mm stdin

# LOAD ECgene tables (re-done with new ldHgGene (braney 2004-01-30))
    cd /cluster/data/mm3/bed
    rm -f ECgene
    mkdir ECgene.2003-12-19
    ln -s ECgene.2003-12-19 ECgene
    cd ECgene
    wget "http://genome.ewha.ac.kr/ECgene/download/ECgene_mm3_v1.1_25oct2003_genes.txt.gz"
    wget "http://genome.ewha.ac.kr/ECgene/download/ECgene_mm3_v1.1_25oct2003_genepep.txt.gz"
    gunzip *.gz
    ldHgGene -predTab mm3 ECgene ECgene_mm3_v1.1_25oct2003_genes.txt
    hgPepPred mm3 tab ECgenePep ECgene_mm3_v1.1_25oct2003_genepep.txt
    rm genePred.tab
    gzip *

######## FAMILY BROWSER UPDATE #######  (WORKING - 2004-02-25 - Hiram)
# These are instructions for building the
# neighborhood browser.  Don't start these until
# there is a knownGene track. and the affy tracks

# Cluster together various alt-splicing isoforms.
#	Creates the knownIsoforms and knownCanonical tables
ssh hgwdev
mkdir /cluster/data/mm3/bed/famBro.2004-02-25
ln -l /cluster/data/mm3/bed/famBro.2004-02-25 /cluster/data/mm3/bed/famBro
cd /cluster/data/mm3/bed/famBro
hgClusterGenes mm3 knownGene knownIsoforms knownCanonical
#	Got 22306 clusters, from 38639 genes in 37 chromosomes
#	row count on knownIsoforms went from 35639 to 38631
#	row count on knownCanonical went from 21381 to 22306

# Extract peptides from knownGenes into fasta file
# and create a blast database out of them.
mkdir /cluster/data/mm3/bed/famBro/blastp
cd /cluster/data/mm3/bed/famBro/blastp
pepPredToFa mm3 knownGenePep known.faa
#	You may need to build this binary in src/hg/near/pepPredToFa
#
#	expect to find /scratch/blast loaded with all required blast files
/scratch/blast/formatdb -i known.faa -t known -n known

# Copy over database to bluearc
rm -fr /cluster/bluearc/mm3/blastp
mkdir /cluster/bluearc/mm3/blastp
cp -p /cluster/data/mm3/bed/famBro/blastp/known.* /cluster/bluearc/mm3/blastp

# Split up fasta file into bite sized chunks for cluster
cd /cluster/data/mm3/bed/famBro/blastp
mkdir split
faSplit sequence known.faa 8000 split/kg

# Make parasol run directory 
ssh kk
mkdir /cluster/data/mm3/bed/famBro/blastp/self
cd /cluster/data/mm3/bed/famBro/blastp/self
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall -p blastp \
	-d /cluster/bluearc/mm3/blastp/known -i $1 -o $2 -e 0.01 -m 8 -b 1000
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat  << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
#	'ls ../../split/*.fa' is too much, hence the echo
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para make jobList

# This should finish in ~5 minutes
# Completed: 7737 of 7737 jobs
# CPU time in finished jobs:     104406s    1740.10m    29.00h    1.21d  0.003 y
# IO & Wait Time:                 24064s     401.06m     6.68h    0.28d  0.001 y
# Average job time:                  17s       0.28m     0.00h    0.00d
# Longest job:                      114s       1.90m     0.03h    0.00d
# Submission to last job:           395s       6.58m     0.11h    0.00d

# Load into database.  This takes about 15 minutes.
ssh hgwdev
cd /cluster/data/mm3/bed/famBro/blastp/self/run/out
time hgLoadBlastTab mm3 knownBlastTab *.tab
# Scanning through 7737 files
# Loading database with 7621524 rows
#	real    15m43.875s
#	user    3m0.350s
#	sys     0m22.970s

# Create table that maps between known genes and RefSeq
hgMapToGene mm3 refGene knownGene knownToRefSeq
#	row count goes from 27744 to 30116

# Create a table that maps between known genes and 
# the nice affy expression data.
#	fixup problem with names
cd /cluster/data/mm3/bed/famBro
hgMapToGene mm3 affyU74  knownGene knownToU74
#	row count went from 17392 to 25733

#	I believe this GNF business is taken care of elsewhere, or at
#	least it only needs to be done once.
# Format and load the GNF data
# mkdir /cluster/data/mm3/bed/affyGnf95
# cd /cluster/data/mm3/bed/affyGnf95
# affyPslAndAtlasToBed -newType ../affyU95.psl \
# 	/projects/compbio/data/microarray/affyGnfHuman/data_public_U95 \
# 	affyGnfU95.tab affyGnfU95Exps.tab -shortOut

#	this .sql load was in preceeding instructions, but this .sql file
#	appears to not exist and it doesn't seem to be needed anyway.
#	Everything below this seems to create tables OK.
#  hgsql mm3 < ~/kent/src/hg/affyGnf/affyGnfU95.sql

# Create table that gives distance in expression space between 
# GNF genes.  The first one takes about 20 minutes, the other two 5 and 1
#	The affyGnfU74?Exps arguments appear to be unused in hgExpDistance
#  runs most efficiently in /tmp since it makes large temporary files
cd /tmp
hgExpDistance mm3 affyGnfU74A affyGnfU74AExps affyGnfU74ADistance \
	-lookup=knownToU74
#	Got 13497 unique elements in affyGnfU74A
#	row count went from 8256000 to 13497000
hgExpDistance mm3 affyGnfU74B affyGnfU74BExps affyGnfU74BDistance \
	-lookup=knownToU74
#	Got 8380 unique elements in affyGnfU74B
#	row count went from 6236000 to 8380000
hgExpDistance mm3 affyGnfU74C affyGnfU74CExps affyGnfU74CDistance \
	-lookup=knownToU74
#	Got 2286 unique elements in affyGnfU74C
#	row count went from 2003000 to 2286000

# Make sure that GO database is up to date.
See README in /cluster/store1/geneOntology.

# Create knownToEnsembl column
cd /cluster/data/mm3/bed/famBro
hgMapToGene mm3 ensGene knownGene knownToEnsembl
#	row count went from 31216 to 33504

# Make knownToCdsSnp column.
hgMapToGene mm3 snpNih knownGene knownToCdsSnp -all -cds
#	row count went from 5007 to 8368

# Make C. elegans ortholog column using blastp on wormpep.
# First make C. elegans protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/ce1/blastp should have data

# Create the ceBlastTab
ssh kk
mkdir /cluster/data/mm3/bed/famBro/blastp/ce1
cd /cluster/data/mm3/bed/famBro/blastp/ce1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall \
	-p blastp -d /cluster/bluearc/ce1/blastp/wormPep \
	-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para make jobList

# Completed: 7737 of 7737 jobs
# CPU time in finished jobs:      51967s     866.12m    14.44h    0.60d  0.002 y
# IO & Wait Time:                 37548s     625.80m    10.43h    0.43d  0.001 y
# Average job time:                  12s       0.19m     0.00h    0.00d
# Longest job:                       39s       0.65m     0.01h    0.00d
# Submission to last job:           720s      12.00m     0.20h    0.01d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm3/bed/famBro/blastp/ce1/run/out
hgLoadBlastTab mm3 ceBlastTab -maxPer=1 *.tab
#	Scanning through 7737 files
#	Loading database with 24847 rows
#	row count went from 23017 to 24847

# Make human ortholog column using blastp on human known genes.
# First make human protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This already exists.  See makeMm3.doc for procedure
#	the directory: /cluster/bluearc/hg16/blastp should have data

# Make parasol run directory 
ssh kk
mkdir /cluster/data/mm3/bed/famBro/blastp/hg16
cd /cluster/data/mm3/bed/famBro/blastp/hg16
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall \
	-p blastp -d /cluster/bluearc/hg16/blastp/known \
	-i $1 -o $2 -e 0.001 -m 8 -b 1
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
#	this echo trick is used because otherwise the command line is
#	too long and you can not do a simple ls

echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para make jobList

# Completed: 7737 of 7737 jobs
# CPU time in finished jobs:     119088s    1984.80m    33.08h    1.38d  0.004 y
# IO & Wait Time:                 39644s     660.73m    11.01h    0.46d  0.001 y
# Average job time:                  21s       0.34m     0.01h    0.00d
# Longest job:                      128s       2.13m     0.04h    0.00d
# Submission to last job:          1180s      19.67m     0.33h    0.01d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm3/bed/famBro/blastp/hg16/run/out
hgLoadBlastTab mm3 hgBlastTab -maxPer=1 *.tab
#	Scanning through 7737 files
#	Loading database with 32987 rows
#	row count went from 30452 to 32987

# Make Danio rerio (zebrafish) ortholog column using blastp on Ensembl.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/dr1/blastp should have data

# Make parasol run directory 
ssh kk
mkdir /cluster/data/mm3/bed/famBro/blastp/dr1
cd /cluster/data/mm3/bed/famBro/blastp/dr1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall \
	-p blastp -d /cluster/bluearc/dr1/blastp/ensembl \
	-i $1 -o $2 -e 0.005 -m 8 -b 1
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para make jobList

# Completed: 7737 of 7737 jobs
# CPU time in finished jobs:      72923s    1215.38m    20.26h    0.84d  0.002 y
# IO & Wait Time:                 36576s     609.61m    10.16h    0.42d  0.001 y
# Average job time:                  14s       0.24m     0.00h    0.00d
# Longest job:                       74s       1.23m     0.02h    0.00d
# Submission to last job:          1015s      16.92m     0.28h    0.01d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm3/bed/famBro/blastp/dr1/run/out
hgLoadBlastTab mm3 drBlastTab -maxPer=1 *.tab
#	Scanning through 7737 files
#	Loading database with 29438 rows
#	row count went from 27260 to 29438

# Make Saccharomyces cerevisiae (yeast) ortholog column using blastp on RefSeq.
# First make protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/sc1/blastp should have data

# Make parasol run directory 
ssh kk
mkdir /cluster/data/mm3/bed/famBro/blastp/sc1
cd /cluster/data/mm3/bed/famBro/blastp/sc1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall \
	-p blastp -d /cluster/bluearc/sc1/blastp/sgd \
	-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para make jobList

# Completed: 7737 of 7737 jobs
# CPU time in finished jobs:      15329s     255.48m     4.26h    0.18d  0.000 y
# IO & Wait Time:                 26523s     442.05m     7.37h    0.31d  0.001 y
# Average job time:                   5s       0.09m     0.00h    0.00d
# Longest job:                       25s       0.42m     0.01h    0.00d
# Submission to last job:          1025s      17.08m     0.28h    0.01d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm3/bed/famBro/blastp/sc1/run/out
hgLoadBlastTab mm3 scBlastTab -maxPer=1 *.tab
# Scanning through 7737 files
# Loading database with 15757 rows
#	row count went from 14466 to 15757

# Make Drosophila melanagaster ortholog column using blastp on FlyBase.
# First make SwissProt protein database and copy it to cluster/bluearc
# if it doesn't exist already
#	This is already done, see makeMm3.doc for procedure
#	the directory: /cluster/bluearc/dm1/blastp should have data

# Make parasol run directory 
ssh kk
mkdir /cluster/data/mm3/bed/famBro/blastp/dm1
cd /cluster/data/mm3/bed/famBro/blastp/dm1
mkdir run
cd run
mkdir out

# Make blast script
cat  << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/scratch/blast/data /scratch/blast/blastall \
	-p blastp -d /cluster/bluearc/dm1/blastp/flyBase \
	-i $1 -o $2 -e 0.01 -m 8 -b 1
'_EOF_'
chmod a+x blastSome

# Make gensub2 file
cat << '_EOF_' > gsub
#LOOP
blastSome {check in line+ $(path1)} {check out line out/$(root1).tab}
#ENDLOOP
'_EOF_'

# Create parasol batch
echo ../../split/*.fa | wordLine stdin > split.lst
gensub2 split.lst single gsub jobList
para make jobList

# Completed: 7737 of 7737 jobs
# CPU time in finished jobs:      59446s     990.77m    16.51h    0.69d  0.002 y
# IO & Wait Time:                 33702s     561.70m     9.36h    0.39d  0.001 y
# Average job time:                  12s       0.20m     0.00h    0.00d
# Longest job:                       41s       0.68m     0.01h    0.00d
# Submission to last job:           880s      14.67m     0.24h    0.01d

# Load into database.  
ssh hgwdev
cd /cluster/data/mm3/bed/famBro/blastp/dm1/run/out
hgLoadBlastTab mm3 dmBlastTab -maxPer=1 *.tab
#	Scanning through 7737 files
#	Loading database with 26299 rows
#	row count went from 24352 to 26299

# CREATING KNOWNtOsUPER (which enables superFamily stuff in hgNear/hgGene)
# First see if need to update superfamily data from 
# ftp server at supfam.mrc-lmb.cam.ac.uk following instructions
# in /cluster/store1/superFamily/genomes/README.ucsc.  Then
# make sure that knownToEnsembl and ensGtp tables are created, then:
ssh hgwdev
zcat /cluster/store1/superFamily/genomes/ass_11-Jan-2004.tab.gz | \
	hgKnownToSuper mm3 mm stdin
#	row count went from 28955 to 29826

    # Update the proteins link into gdbPdb.hgcentraltest:
hgsql \
 -e 'update gdbPdb set proteomeDb = "proteins040115" where genomeDb = "mm3";' \
        -h genome-testdb hgcentraltest

# ECORES FROM GENOSCOPE [DONE, hartera, 2004-03-31]
# download data from  http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecores
# ecotigMF - ecores on Mouse, genome conserved with Fugu
# ecotigMT - ecores on Mouse, genome conserved with Tetraodon

    ssh hgwdev
    mkdir /cluster/data/mm3/bed/ecores/
    # add parse_ecotig.pl to this directory

# FUGU
    mkdir /cluster/data/mm3/bed/ecores/fr1
    cd /cluster/data/mm3/bed/ecores/fr1/

    # download data for ecotigMF to this directory
    # parse ecotig files to produce a bed format file
    perl ../parse_ecotig.pl < ecotigMF > ecotigMF.bed

    # change from upper to lower case for "CHR"
    perl -pi.bak -e 's/CHR/chr/g' ecotigMF.bed

    hgLoadBed -tab mm3 ecoresFr1 ecotigMF.bed
    # clean up
    rm *.bak

# TETRAODON
    mkdir /cluster/data/mm3/bed/ecores/tetraodon
    cd /cluster/data/mm3/bed/ecores/tetraodon/

    # download data for ecotigMT to this directory
    # parse ecotig files to produce a bed format file
    perl ../parse_ecotig.pl < ecotigMT > ecotigMT.bed
    # change from upper to lower case for "CHR"
    perl -pi.bak -e 's/CHR/chr/g' ecotigMT.bed
    
    hgLoadBed -tab mm3 ecoresTetraodon ecotigMT.bed

    # clean up
    rm *.bak

    # add entries in kent/src/hg/makeDb/trackDb/mouse/mm3/trackDb.ra
    # add html for details pages to this directory:
    # ecoresFr1.html and ecoresTetraodon.html

#  gc5Base wiggle TRACK (DONE - 2004-04-07 - Hiram)
    # Perform a gc count with a 5 base
    # window.  Also compute a "zoomed" view for display efficiency.
    mkdir /cluster/data/mm3/bed/gc5Base
    cd /cluster/data/mm3/bed/gc5Base

    #	in the script below, the 'grep -w GC' selects the lines of
    #	output from hgGcPercent that are real data and not just some
    #	information from hgGcPercent.  The awk computes the number
    #	of bases that hgGcPercent claimed it measured, which is not
    #	necessarily always 5 if it ran into gaps, and then the division
    #	by 10.0 scales down the numbers from hgGcPercent to the range
    #	[0-100].  Two columns come out of the awk print statement:
    #	<position> and <value> which are fed into wigAsciiToBinary through
    #	the pipe.  It is set at a dataSpan of 5 because each value
    #	represents the measurement over five bases beginning with
    #	<position>.  The result files end up in ./wigData5.
    cat << '_EOF_' > runGcPercent.sh
#!/bin/sh
mkdir -p wigData5
mkdir -p dataLimits5
for n in ../../nib/*.nib
do
	c=`basename ${n} | sed -e "s/.nib//"`
	C=`echo $c | sed -e "s/chr//"`
	echo -n "working on ${c} - ${C} ... "
	hgGcPercent -chr=${c} -doGaps \
		-file=stdout -win=5 mm3 ../../nib | grep -w GC | \
		awk '{printf "%d\t%.1f\n", $2+1, $5/10.0 }' | \
	wigAsciiToBinary \
	-dataSpan=5 -chrom=${c} -wibFile=wigData5/gc5Base_${C} \
	-name=${C} stdin 2> dataLimits5/${c}
echo "done"
done
'_EOF_'
    chmod +x runGcPercent.sh
    #	This is going to take perhaps two hours to run.  It is a lot of
    #	data.  make sure you do it on the fileserver:
    ssh eieio
    cd /cluster/data/mm3/bed/gc5Base
    time ./runGcPercent.sh > run.out 2>&1
    #	real    214m1.996s
    #	user    286m36.810s
    #	sys     10m46.500s

    # load the .wig files back on hgwdev:
    ssh hgwdev
    cd /cluster/data/mm3/bed/gc5Base
    hgLoadWiggle -pathPrefix=/gbdb/mm3/wib/gc5Base mm3 gc5Base wigData5/*.wig
    # and symlink the .wib files into /gbdb
    mkdir /gbdb/mm3/wib/gc5Base
    ln -s `pwd`/wigData5/*.wib /gbdb/mm3/wib/gc5Base

    # to speed up display for whole chromosome views, compute a "zoomed"
    # view and load that on top of the existing table.  The savings
    # comes from the number of data table rows the browser needs to load
    # for a full chromosome view.  Without the zoomed view there are
    # over 43,000 data rows for chrom 1.  With the zoomed view there are
    # only 222 rows needed for the display.  If your original data was
    # at 1 value per base the savings would be even greater.
    #	Pretty much the same data calculation
    # situation as above, although this time note the use of the
    # 'wigZoom -dataSpan=1000 stdin' in the pipeline.  This will average
    # together the data points coming out of the awk print statment over
    # a span of 1000 bases.  Thus each <position> coming out of wigZoom
    # will represent the measurement of GC in the next 1000 bases.  Note
    # the use of -dataSpan=1000 on the wigAsciiToBinary to account for
    # this type of data.  You want your dataSpan here to be an exact
    # multiple of your original dataSpan (5*200=1000) and on the order
    # of at least 1000, doesn't need to go too high.  For data that is
    # originally at 1 base per value, a convenient span is: -dataSpan=1024
    # A new set of result files ends up in ./wigData5_1K/*.wi[gb]
    cat << '_EOF_' > runZoom.sh
#!/bin/sh

mkdir -p wigData5_1K
mkdir -p dataLimits5_1K

for n in ../../nib/*.nib
do
        c=`basename ${n} | sed -e "s/.nib//"`
        C=`echo $c | sed -e "s/chr//"`
        echo -n "working on ${c} - ${C} ... "
        hgGcPercent -chr=${c} -doGaps \
                -file=stdout -win=5 mm3 ../../nib | grep -w GC | \
                awk '{printf "%d\t%.1f\n", $2+1, $5/10.0}' | \
	wigZoom -dataSpan=1000 stdin | wigAsciiToBinary \
        -dataSpan=1000 -chrom=${c} -wibFile=wigData5_1K/gc5Base_${C}_1K \
        -name=${C} stdin 2> dataLimits5_1K/${c}
echo "done"
done
'_EOF_'
    chmod +x runZoom.sh
    #	This is going to take just as long as above, certainly do this
    #	on the fileserver
    ssh eieio
    time ./runZoom.sh > zoom.out 2>&1
    #	real    210m41.651s
    #	user    283m40.720s
    #	sys     9m35.820s

    #	Then load these .wig files into the same database as above
    ssh hgwdev
    hgLoadWiggle -pathPrefix=/gbdb/mm3/wib/gc5Base -oldTable mm3 gc5Base \
	wigData5_1K/*.wig
    # and symlink these .wib files into /gbdb
    ln -s `pwd`/wigData5_1K/*.wib /gbdb/mm3/wib/gc5Base


# ANDY LAW CPGISSLANDS (DONE 6/17/04 angie)
    # See notes about this in makeGalGal2.doc.
    ssh kkstore
    mkdir /cluster/data/mm3/bed/cpgIslandGgfAndy
    cd /cluster/data/mm3/bed/cpgIslandGgfAndy
    cp /dev/null cpgIslandAndy.bed
    cp /dev/null cpgIslandGgfAndy.bed
    foreach f (../../?{,?}/chr*.fa)
      set chr = $f:t:r
      echo preproc $chr
      /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.preproc
      echo running original on $chr
      awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.preproc \
      | /cluster/home/angie/andy-cpg-island.pl \
      | perl -wpe '$i=0 if (not defined $i); \
                   chomp; ($s,$e) = split("\t"); $s--; \
                   $_ = "'$chr'\t$s\t$e\tcpg$i\n";  $i++' \
      >> cpgIslandAndy.bed
      echo running modified on $chr
      /cluster/home/angie/ggf-andy-cpg-island.pl $chr.preproc \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandGgfAndy.bed
    end
    # load into database:
    ssh hgwdev
    cd /cluster/data/mm3/bed/cpgIslandGgfAndy
    # this one is a bed 4:
    hgLoadBed mm3 cpgIAndy -tab -noBin cpgIslandAndy.bed
    # this one is a cpgIslandExt but with a different table name:
    sed -e 's/cpgIslandExt/cpgIslandGgfAndy/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndy.sql
    hgLoadBed mm3 cpgIslandGgfAndy -tab -noBin \
      -sqlTable=cpgIslandGgfAndy.sql cpgIslandGgfAndy.bed
    # WOW, even masking out repeat bases from the results, there's a huge 
    # increase in reported islands!!
    featureBits mm3 cpgIsland
#10102968 bases of 2505900260 (0.403%) in intersection
    featureBits mm3 cpgIslandGgfAndy
#61612798 bases of 2505900260 (2.459%) in intersection
    featureBits mm3 cpgIslandGgfAndy \!rmsk
#43063885 bases of 2505900260 (1.718%) in intersection
    wc -l ../cpgIsland/cpgIsland.bed *bed
#  15957 ../cpgIsland/cpgIsland.bed
# 152344 cpgIslandAndy.bed
# 114386 cpgIslandGgfAndy.bed
    # LaDeana asked for a run on masked seq for comparison.
    cp /dev/null cpgIslandAndyMasked.bed
    cp /dev/null cpgIslandGgfAndyMasked.bed
    foreach f (../../?{,?}/chr*.fa.masked)
      set chr = $f:t:r:r
      echo preproc masked $chr
      /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy $f > $chr.masked.preproc
      echo running original on $chr masked
      awk '{print $1 "\t" $2 "\t" ($3 + $4) "\t" $5;}' $chr.masked.preproc \
      | /cluster/home/angie/andy-cpg-island.pl \
      | perl -wpe '$i=0 if (not defined $i); \
                   chomp; ($s,$e) = split("\t"); $s--; \
                   $_ = "'$chr'\t$s\t$e\tcpg$i\n";  $i++' \
      >> cpgIslandAndyMasked.bed
      echo running modified on $chr masked
      /cluster/home/angie/ggf-andy-cpg-island.pl $chr.masked.preproc \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandGgfAndyMasked.bed
    end
    # load into database:
    ssh hgwdev
    cd /cluster/data/mm3/bed/cpgIslandGgfAndy
    # this one is a bed 4:
    hgLoadBed mm3 cpgIAndyMasked -tab -noBin cpgIslandAndyMasked.bed
    # this one is a cpgIslandExt but with a different table name:
    sed -e 's/cpgIslandExt/cpgIslandGgfAndyMasked/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > cpgIslandGgfAndyMasked.sql
    hgLoadBed mm3 cpgIslandGgfAndyMasked -tab -noBin \
      -sqlTable=cpgIslandGgfAndyMasked.sql cpgIslandGgfAndyMasked.bed
    featureBits mm3 cpgIAndyMasked
#52735386 bases of 2505900260 (2.104%) in intersection
    featureBits mm3 cpgIslandGgfAndyMasked
#37169690 bases of 2505900260 (1.483%) in intersection
    wc -l cpgIslandAndyMasked.bed cpgIslandGgfAndyMasked.bed
#  94517 cpgIslandAndyMasked.bed
#  66504 cpgIslandGgfAndyMasked.bed
    # 6/28/04 -- masking simpleRepeats, and even repeats other than Alu's,
    # might not be the right thing to do (?).  Give it a try with less-masked 
    # sequence.
    ssh kkstore
    cd /cluster/data/mm3/bed/cpgIslandGgfAndy
    cp /dev/null cpgIslandGgfAndyOnlyRM.bed
    cp /dev/null cpgIslandGgfAndyOnlyRMAlu.bed
    foreach f (../../?{,?}/chr*.fa)
      set chr = $f:t:r
      echo preproc, ggf-andy $chr onlyRM
      maskOutFa $f $f.out stdout \
      | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \
      | /cluster/home/angie/ggf-andy-cpg-island.pl \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandGgfAndyOnlyRM.bed
      echo preproc, ggf-andy $chr onlyRMAlu
      head -3 $f.out > /tmp/tmp2.fa.out
      awk '$11 == "SINE/Alu" {print;}' $f.out >> /tmp/tmp2.fa.out
      maskOutFa $f /tmp/tmp2.fa.out stdout \
      | /cluster/home/angie/bin/$MACHTYPE/preProcGgfAndy stdin \
      | /cluster/home/angie/ggf-andy-cpg-island.pl \
      | perl -wpe 'chomp; ($s,$e,$cpg,$n,$c,$g,$oE) = split("\t"); $s--; \
                   $gc = $c + $g;  $pCpG = (100.0 * 2 * $cpg / $n); \
                   $pGc = (100.0 * $gc / $n); \
                   $_ = "'$chr'\t$s\t$e\tCpG: $cpg\t$n\t$cpg\t$gc\t" . \
                        "$pCpG\t$pGc\t$oE\n";' \
      >> cpgIslandGgfAndyOnlyRMAlu.bed
    end
# 110807 cpgIslandGgfAndyOnlyRMAlu.bed
#  66614 cpgIslandGgfAndyOnlyRM.bed
    ssh hgwdev
    cd /cluster/data/mm3/bed/cpgIslandGgfAndy
    sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRM/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql
    hgLoadBed mm3 cpgIslandGgfAndyOnlyRM -tab -noBin -sqlTable=/tmp/c.sql \
      cpgIslandGgfAndyOnlyRM.bed
    sed -e 's/cpgIslandExt/cpgIslandGgfAndyOnlyRMAlu/g' \
      $HOME/kent/src/hg/lib/cpgIslandExt.sql > /tmp/c.sql
    hgLoadBed mm3 cpgIslandGgfAndyOnlyRMAlu -tab -noBin -sqlTable=/tmp/c.sql \
      cpgIslandGgfAndyOnlyRMAlu.bed
    featureBits mm3 cpgIslandGgfAndyOnlyRM
#37231991 bases of 2505900260 (1.486%) in intersection
    featureBits mm3 cpgIslandGgfAndyOnlyRMAlu
#58794013 bases of 2505900260 (2.346%) in intersection


# MAP ENCODE ORTHOLOG REGIONS (2004-07-09 kate)
    ssh kkstore
    cd /cluster/data/mm3/bed
    mkdir encodeRegions
    cd encodeRegions
    # Of 44 regions:
    # -minMatch=.20 -> 37 mapped
    liftOver -minMatch=.20 \
        /cluster/data/hg16/bed/encodeRegions/encodeRegions.bed \
        /cluster/data/hg16/bed/liftOver/hg16Tomm3.chain \
        encodeRegions.bed encodeRegions.unmapped
    ssh hgwdev
    cd /cluster/data/mm3/bed/encodeRegions
    hgLoadBed mm3 encodeRegions encodeRegions.bed -noBin

    ssh kkstore
    cd /cluster/data/mm3/bed/encodeRegions
    mkdir minMatch10
    cd minMatch10
    liftOver -minMatch=.10 \
        /cluster/data/hg16/bed/encodeRegions/encodeRegions.bed \
        /cluster/data/hg16/bed/liftOver/hg16Tomm3.chain \
        encodeRegions.bed encodeRegions.unmapped
    wc -l *
        # 42 encodeRegions.bed
        # 4 encodeRegions.unmapped
    ssh hgwdev
    cd /cluster/data/mm3/bed/encodeRegions/minMatch10
    hgLoadBed mm3 encodeRegionsNew encodeRegions.bed -noBin
    # NOTE: need to create this table with the schema
    # to preserve the Notes field


# LIFTOVER CHAINS TO MM6 (2005-09-25 kate)
    # NOTE: split mm6 is doc'ed in makeMm5.doc
    
    ssh kkstore01
    cp -r /cluster/data/mm6/liftSplits/ /san/sanvol1/scratch/mm6

    ssh pk
    cd /cluster/data/mm3
    makeLoChain-align mm3 /scratch/hg/mm3/chromTrfMixedNib \
        mm6 /san/sanvol1/scratch/mm6/liftSplits/split /scratch/hg/h/mouse11.ooc
        #Setting up blat in /cluster/data/mm3/bed/blat.mm6.2005-09-24/run
        # 1480 jobs
    cd bed
    rm -f blat.mm6
    ln -s blat.mm6.2005-09-24 blat.mm6
    cd blat.mm6/run
    para try
    para check

    # jobs tie up pk cluster (4G) -- the chrom sequence files should be split
    # up to create more, smaller cluster jobs

    # stopped with 1 hippo and 1 crashed.  Resubmit
# CPU time in finished jobs:    1586754s   26445.90m   440.77h   18.37d  0.050 y
# IO & Wait Time:                 86144s    1435.73m    23.93h    1.00d  0.003 y
# Time in running jobs:          111814s    1863.57m    31.06h    1.29d  0.004 y
# Average job time:                1132s      18.86m     0.31h    0.01d
# Longest running job:           111814s    1863.57m    31.06h    1.29d
# Longest finished job:           22685s     378.08m     6.30h    0.26d
# Submission to last job:         29164s     486.07m     8.10h    0.34d

    # retry chrUn vs chrUn (hippo) on kolossus
    # NOTE: this took _days_.  Needs to be split up.
    ssh kolossus
    blat /scratch/hg/mm3/chromTrfMixedNib/chrUn_random.nib /san/sanvol1/scratch/mm6/liftSplits/split/chrUn_random.fa ../raw/chrUn_random_chrUn_random.psl -tileSize=11 -ooc=/scratch/hg/h/mouse11.ooc -minScore=100 -minIdentity=98 -fastMap

    # lift results
    # this expects data in bed/blat.mm6, so symlink must be there 
    # use kolossus for speed
    ssh kolossus
    cd /cluster/data/mm3/bed/blat.mm6
    # NOTE: had to make a local copy of makeLoChain-lift and edit
    # to remove dependence on date of previous run.
    makeLoChain-lift mm3 mm6 /san/sanvol1/scratch/mm6/liftSplits/lift
                        >&! lift.log &
    tail -100f lift.log

    # chain alignments
    ssh kki
    cd /cluster/data/mm3/bed/blat.mm6
    makeLoChain-chain mm3 /cluster/data/mm3/nib mm6 /cluster/data/mm6/nib
        # Created parasol job in /cluster/data/mm3/bed/blat.mm6/chainRun
    cd /cluster/data/mm3/bed/blat.mm6/chainRun
    para try
        # 46 jobs
    para check
    para push

    ssh kkstore01
    cd /cluster/data/mm3/bed/blat.mm6
    makeLoChain-net mm3 mm6

    # GOT HERE
    # Can't load into liftOverChain table -- format has changed


# LIFTOVER CHAINS TO MM5 (WORKING 2004-09-29 kate)

    # run alignment
    # NOTE: split mm5 to /iscratch/i is doc'ed in makeHg17.doc
    ssh kk9
    cd /cluster/data/mm3
    makeLoChain-align mm3 /scratch/hg/mm3/chromTrfMixedNib \
        mm5 /iscratch/i/mm5/liftOver/liftSplit/split /scratch/hg/h/mouse11.ooc
        #Setting up blat in /cluster/data/mm3/bed/blat.mm5.2004-09-29
        # 1591 jobs
    cd bed
    rm -f blat.mm5
    ln -s blat.mm5.2004-09-29 blat.mm5
    cd blat.mm5/run
    para try
    para check
    para push
    # GOT HERE

    # lift results
    # the lift directory was defined in makeMm5.doc when split was performed
    # this expects data in bed/blat.mm5, so symlink must be there 
    # use kolossus for speed
    ssh kkstore
    cd /cluster/data/mm3/bed/blat.mm5
    makeLoChain-lift mm3 mm5 /iscratch/i/mm5/liftOver/liftSplit/lift \
                        >&! lift.log &
    tail -100f lift.log
        # 25 minutes 

    # GOT HERE


# HUMAN HG13 BLASTZ CLEANUP (DONE 2004-09-20 kate)
    ssh kkstore
    cd /cluster/data/mm3/bed/blastz.hg13
    nice rm -rf raw lav
    nice rm axtChain/run1/chain/* axtChain/preNet/*.chain axtChain/hNoClass.net
    nice gzip axtChrom/*.axt axtChain/{all.chain,*.net} axtChain/*Net/*.net
    nice gzip axtBest/*.axt pslChrom/*.psl

# HUMAN HG15 BLASTZ CLEANUP (DONE 2004-09-21 kate)
    ssh kkstore
    cd /cluster/data/mm3/bed/blastz.hg15.swap
    nice rm -rf raw lav
    nice rm axtChain/run1/chain/* axtChain/preNet/*.chain axtChain/hNoClass.net
    nice gzip axtChain/{all.chain,*.net} axtChain/*[nN]et/*.net
    nice gzip axtChain/chain2/*.chain
    nice gzip axt*/*.axt psl*/*.psl

# RAT RN2 BLASTZ CLEANUP (DONE 2004-09-21 kate)
    ssh kkstore
    cd /cluster/data/mm3/bed/blastz.rn2
    nice rm axtChain/run1/chain/* axtChain/preNet/*.chain axtChain/hNoClass.net
    nice gzip axtChain/{all.chain,*.net} axtChain/*[nN]et/*.net
    nice gzip axt*/*.axt psl*/*.psl
    rm psl*/psl.tab

# MOUSE MM3 BLASTZ CLEANUP (DONE 2004-09-21 kate)
    ssh kkstore
    cd /cluster/data/mm3/bed/blastz.mm3.2003-03-06-ASH
    nice gzip axt*/*.axt psl*/*.psl


