# This file describes how we made the browser database on the Rattus 
# norvegicus genome, January 2003 update.

DOWNLOAD SEQUENCE (DONE 02/05/03)

    ssh eieio
    mkdir /cluster/store4/rn2
    cd /cluster/store4/rn2
    wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/README
    wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/conditions_for_use

    # Get BCM's chrom assemblies -- we will assemble our own chr*.fa from 
    # contig fa + agp, and cross-check against this.  
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 X Un)
      mkdir $c
      wget -O $c/chr$c.fa.bcm.gz \
       ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/chromosome/chr$c.fa.gz
      wget -O $c/chr${c}_random.fa.bcm.gz \
       ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/chromosome/chr$c.random.fa.gz
    end

    # Get BCM's contig fa + agp.  We will split into our own conveniently-sized
    # pseudo-contigs, and assemble chrom fa.  
    wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/bacfile2-1.gz
    wget ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/record.dat.gz
    foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 X Un)
      wget -O $c/chr$c.agp.gz \
       ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/chr$c.agp.gz
      wget -O $c/chr$c.contig.fa.gz \
       ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/chr$c.contig.fa.gz
      wget -O $c/chr${c}_random.agp.gz \
       ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/chr$c.random.agp.gz
      wget -O $c/chr${c}_random.contig.fa.gz \
       ftp://rat-ftp.hgsc.bcm.tmc.edu/pub/analysis/rat/contigs/chr$c.random.contig.fa.gz
      gunzip $c/chr$c.agp.gz
      gunzip $c/chr${c}_random.agp.gz
    end

BUILD AND CHECK CHROM-LEVEL SEQUENCE (DONE 02/05/03)

    # Make chrom fa:
    foreach c (?{,?})
      gunzip $c/chr$c.contig.fa.gz
      agpToFa -simpleMulti $c/chr$c.agp chr$c $c/chr$c.fa $c/chr$c.contig.fa
      if (-e $c/chr${c}_random.agp) then
        gunzip $c/chr${c}_random.contig.fa.gz
        agpToFa -simpleMulti $c/chr${c}_random.agp chr${c}_random \
          $c/chr${c}_random.fa $c/chr${c}_random.contig.fa
      endif
    end
    # Check that the size of each chromosome .fa file is equal to the 
    # last coord of the .agp:
    foreach f ( */*.agp )
      set agpLen = `tail -1 $f | awk '{print $3;}'`
      set g = $f:r
      set faLen = `faSize $g.fa | awk '{print $1;}'`
      if ($agpLen == $faLen) then
        echo $f length = $g length = $faLen
      else
        echo Error\!\!\!  $f length = $agpLen, but $g length = $faLen
      endif
    end
    # Check that our assembled chrom fa matches the BCM chrom fa in length:
    foreach c ( ?{,?} )
      set ucscLen = `faSize $c/chr$c.fa | awk '{print $1;}'`
      set bcmLen  = `gunzip -c $c/chr$c.fa.bcm.gz | faSize stdin \
                       | awk '{print $1;}'`
      if ($ucscLen == $bcmLen) then
        echo chr$c.fa length = chr$c.fa.bcm length = $bcmLen
      else
        echo Error\!\!\!  chr$c.fa length = $ucscLen, but chr$c.fa.bcm length = $bcmLen
      endif
      if (-e $c/chr${c}_random.fa) then
        set ucscLen = `faSize $c/chr${c}_random.fa | awk '{print $1;}'`
        set bcmLen  = `gunzip -c $c/chr${c}_random.fa.bcm.gz | faSize stdin \
                        | awk '{print $1;}'`
        if ($ucscLen == $bcmLen) then
          echo chr${c}_random.fa length = chr${c}_random.fa.bcm length = $bcmLen
        else
          echo Error\!\!\!  chr${c}_random.fa length = $ucscLen, but chr${c}_random.fa.bcm length = $bcmLen
        endif
      endif
    end

BREAK UP SEQUENCE INTO 5 MB CHUNKS AT NON_BRIDGED CONTIGS (DONE 02/05/03)

    ssh hgwdev
    cd into your CVS source tree under kent/src/hg/splitFaIntoContigs
    make

    # This will split the rat sequence into approx. 5 Mbase
    # supercontigs between non-bridged clone contigs and drop the
    # resulting dir structure in /cluster/store4/rn2.  The resulting
    # dir structure will include 1 dir for each chromosome, each of
    # which has a set of subdirectories, one subdir per supercontig.
    ssh eieio
    cd /cluster/store4/rn2
    foreach c (?{,?})
      cp -p $c/chr$c.agp $c/chr$c.agp.bak
      cp -p $c/chr$c.fa $c/chr$c.fa.bak
      splitFaIntoContigs $c/chr$c.agp $c/chr$c.fa . -nSize=5000000
      if (-e $c/chr${c}_random.fa) then
        cp -p $c/chr${c}_random.agp $c/chr${c}_random.agp.bak
        cp -p $c/chr${c}_random.fa $c/chr${c}_random.fa.bak
        splitFaIntoContigs $c/chr${c}_random.agp $c/chr${c}_random.fa . \
          -nSize=5000000
        mv ${c}_random/lift/oOut.lst $c/lift/rOut.lst
        mv ${c}_random/lift/ordered.lft $c/lift/random.lft
        mv ${c}_random/lift/ordered.lst $c/lift/random.lst
        rmdir ${c}_random/lift
        rm ${c}_random/chr${c}_random.{agp,fa}
        mv ${c}_random/* $c
        rmdir ${c}_random
      endif
    end
    # Make sure the reconstructed .fa matches the original (each diff
    # should report 0 lines):
    foreach f ( */*.fa.bak )
      echo $f:r
      diff $f $f:r | wc -l
    end
    # The .agp goes through a slight format change, but make sure it 
    # at least ends up with the same number of lines:
    foreach f ( */*.agp.bak )
      set l1 = `wc -l $f | awk '{print $1;}'`
      set l2 = `wc -l $f:r | awk '{print $1;}'`
      if ($l1 == $l2) then
        echo "$f and $f:r have the same #lines"
      else
        echo Error\!\!\!  $f has $l1 lines, but $f:r has $l2
      endif
    end
    # Save some space
    foreach c (?{,?})
      echo $c
      gzip $c/chr*.contig.fa
    end
    rm */*.bak

COPY OVER JKSTUFF SCRIPTS DIRECTORY (DONE 02/05/03)

    ssh eieio
    ln -s /cluster/store4/rn2 ~/rn2
    rm -f ~/lastRn
    ln -s /cluster/store4/rn1 ~/lastRn
    cd ~/rn2
    cp -Rp ~/lastRn/jkStuff .
    rm jkStuff/*.{out,lst,lft} jkStuff/*~


CREATING DATABASE (DONE 02/06/03)

    # Create the database.
    ssh hgwdev
    # Enter mysql via:
    hgsql rn1
    # At mysql prompt type:
      create database rn2;
      quit
    # make a semi-permanent read-only alias:
    alias rn2 "mysql -u hguser -phguserstuff -A rn2"
    # Use df to make sure there is at least 5 gig free on 
    # hgwdev:/var/lib/mysql


CREATING GRP TABLE FOR TRACK GROUPING (DONE 02/11/03)
    ssh hgwdev
    echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \
      | hgsql rn2


REPEAT MASKING (DONE 03/06/03)
   Split contigs, run RepeatMasker, lift results
   Notes: 
   * If there is a new version of RepeatMasker, build it and ask the admins 
     to binrsync it (kkstore:/scratch/hg/RepeatMasker/*).
   * Contigs (*/chr*_*/chr*_*.fa) are split into 500kb chunks to make 
     RepeatMasker runs manageable on the cluster ==> results need lifting.
   * For the NCBI assembly we repeat mask on the sensitive mode setting
     (RepeatMasker -m -s)

        #- Split contigs into 500kb chunks:
        ssh eieio
        cd ~/rn2
        foreach d ( */chr*_?{,?} )
          cd $d
          set contig = $d:t
          faSplit size $contig.fa 500000 ${contig}_ -lift=$contig.lft \
            -maxN=500000
          cd ../..
        end

        #- Make the run directory and job list:
        cd ~/rn2
        mkdir RMRun
        rm -f RMRun/RMJobs
        touch RMRun/RMJobs
        foreach d ( ?{,?}/chr*_?{,?} )
          set ctg = $d:t
          foreach f ( $d/${ctg}_?{,?}.fa )
            set f = $f:t
            echo /cluster/bin/scripts/RMRat \
                 /cluster/store4/rn2/$d $f \
               '{'check out line+ /cluster/store4/rn2/$d/$f.out'}' \
              >> RMRun/RMJobs
          end
        end

        #- Do the run
        ssh kk
        cd ~/rn2/RMRun
        para create RMJobs
        para try, para check, para check, para push, para check,...

        #- Lift up the split-contig .out's to contig-level .out's
        ssh eieio
        cd ~/rn2
        foreach d ( ?{,?}/chr*_?{,?} )
          cd $d
          set contig = $d:t
          liftUp $contig.fa.out $contig.lft warn ${contig}_*.fa.out > /dev/null
          cd ../..
        end

        #- Lift up the contig-level .out's to chr-level
        cd ~/rn2
        ./jkStuff/liftOut5.sh

        # soft-mask contig .fa's with .out's
        foreach i (?{,?})
            foreach j ($i/chr${i}_?{,?}/chr${i}_?{,?}.fa \
                       $i/chr${i}_random_?{,?}/chr${i}_random_?{,?}.fa)
                maskOutFa $j $j.out $j -soft
            end
            echo done $i
        end

        #- Load the .out files into the database with:
        ssh hgwdev
        cd ~/rn2
        hgLoadOut rn2 ?{,?}/*.fa.out


MAKE LIFTALL.LFT (DONE 02/05/03)

    cd ~/rn2
    cat ?{,?}/lift/{ordered,random}.lft > jkStuff/liftAll.lft

VERIFY REPEATMASKER RESULTS (DONE 03/06/03)

    # Run featureBits on rn2 and on a comparable genome build, and compare:
    ssh hgwdev
    featureBits rn2 rmsk
    # --> 1100534407 bases of 2764911379 (39.804%) in intersection
    # --> (orig run, July libs) 1058156286 bases of 2764911379 (38.271%) in intersection
    featureBits rn1 rmsk
    # --> 1081814344 bases of 2852382926 (37.927%) in intersection


STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  (DONE 02/06/03)

    # Make (unmasked) nibs
    ssh eieio
    cd ~/rn2
    mkdir nib
    foreach f (?{,?}/chr*.fa)
      faToNib $f nib/$f:t:r.nib
    end
    # Make symbolic links from /gbdb/rn2/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/rn2/nib
    foreach f (/cluster/store4/rn2/nib/chr*.nib)
      ln -s $f /gbdb/rn2/nib
    end
    # Load /gbdb/rn2/nib paths into database and save size info.
     ssh hgwdev
     hgsql rn2  < ~/src/hg/lib/chromInfo.sql
     cd ~/rn2
     hgNibSeq -preMadeNib rn2 /gbdb/rn2/nib ?{,?}/chr?{,?}{,_random}.fa
     echo "select chrom,size from chromInfo" | hgsql -N rn2 > chrom.sizes


GOLD AND GAP TRACKS (DONE 02/06/03)
    ssh hgwdev
    cd ~/rn2
    hgGoldGapGl -noGl rn2 /cluster/store4/rn2 .


MAKE GCPERCENT (DONE 02/06/03)
     ssh hgwdev
     mkdir -p /cluster/store4/rn2/bed/gcPercent
     cd /cluster/store4/rn2/bed/gcPercent
     hgsql rn2  < ~/src/hg/lib/gcPercent.sql
     hgGcPercent rn2 ../../nib


MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR RN2 (DONE 02/06/03)
    # Enter rn2 into hgcentraltest.dbDb so test browser knows about it:
    mysql -h genome-testdb -u root -pbigSecret -A hgcentraltest
      insert into dbDb values("rn2", "Rat Jan. 2003",
        "/gbdb/rn2/nib", "Rat", "Napa", 1,
        20, "Rat");
      quit
    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/src/hg/makeDb/trackDb
    cvs up -d -P
    # Edit that makefile to add rn2 in all the right places and do
    make update
    make alpha
    cvs commit makefile


MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR RN2 (DONE 02/13/03)
    ssh hgwdev
    echo 'insert into blatServers values("rn2", "blat10", "17778", "1"); \
          insert into blatServers values("rn2", "blat10", "17779", "0");' \
      | hgsql -h genome-testdb hgcentraltest


SIMPLE REPEAT TRACK (DONE 02/07/03)
    # TRF runs pretty quickly now... it takes a few hours total runtime, 
    # so instead of binrsyncing and para-running, just do this on eieio:
    ssh eieio
    mkdir ~/rn2/bed/simpleRepeat
    cd ~/rn2/bed/simpleRepeat
    mkdir trf
    rm -f jobs.csh
    touch jobs.csh
    foreach f (/cluster/store4/rn2/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa)
      set fout = $f:t:r.bed
      echo "/cluster/home/kent/bin/i386/trfBig -trf=/cluster/home/kent/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
        >> jobs.csh
    end
    tcsh jobs.csh |& tee jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l
    # When job is done do:
    liftUp simpleRepeat.bed ~/rn2/jkStuff/liftAll.lft warn trf/*.bed

    # Load this into the database as so
    ssh hgwdev
    cd ~/rn2/bed/simpleRepeat
    hgLoadBed rn2 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql


PROCESS SIMPLE REPEATS INTO MASK (DONE 02/07/03)

    # After the simpleRepeats track has been built, make a filtered version 
    # of the trf output: keep trf's with period <= 12:
    ssh eieio
    cd ~/rn2/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/chr*.bed)
      awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end
    # Lift up filtered trf output to chrom coords as well:
    cd ~/rn2
    mkdir -p bed/simpleRepeat/trfMaskChrom
    foreach c (?{,?})
      perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
        $c/lift/ordered.lst > $c/lift/oTrf.lst
      if (-e $c/lift/random.lst) then
        perl -wpe 's@(\S+)@bed/simpleRepeat/trfMask/$1.bed@' \
           $c/lift/random.lst > $c/lift/rTrf.lst
      endif
      liftUp bed/simpleRepeat/trfMaskChrom/chr$c.bed \
        jkStuff/liftAll.lft warn `cat $c/lift/oTrf.lst`
      if (-e $c/lift/rTrf.lst) then
        liftUp bed/simpleRepeat/trfMaskChrom/chr${c}_random.bed \
          jkStuff/liftAll.lft warn `cat $c/lift/rTrf.lst`
      endif
    end


MASK SEQUENCE WITH BOTH REPEATMASKER AND SIMPLE REPEAT/TRF (DONE 03/06/03)

    # This used to be done right after RepeatMasking.  Now, we mask with 
    # TRF as well, so do this after the "PROCESS SIMPLE REPEATS" step above.
    ssh eieio
    cd ~/rn2
    #- Soft-mask (lower-case) the contig and chr .fa's
    ./jkStuff/makeFaMasked.sh
    #- Make hard-masked .fa.masked files as well:
    ./jkStuff/makeHardMasked.sh
    #- Rebuild the nib, mixedNib, maskedNib files:
    ./jkStuff/makeNib.sh
    # Copy the masked contig fa to /scratch:
    ssh kkstore
    rm -rf /scratch/hg/rn2/trfFa
    mkdir -p /scratch/hg/rn2/trfFa
    cp -p ~/rn2/?{,?}/chr*_*/chr?{,?}{,_random}_?{,?}.fa /scratch/hg/rn2/trfFa


MAKE DOWNLOADABLE SEQUENCE FILES (DONE 03/06/03)
    ssh eieio
    cd ~/rn2
    #- Build the .zip files
    ./jkStuff/zipAll.sh |& tee zipAll.log
    mkdir zip
    mv *.zip* zip
    cd zip
    #- Look at zipAll.log to make sure all file lists look reasonable.  
    #- Check zip file integrity:
    foreach f (*.zip)
      unzip -t $f > $f.test
      tail -1 $f.test
    end
    wc -l *.zip.test

    #- Copy the .zip files to hgwdev:/usr/local/apache/...
    ssh hgwdev
    cd ~/rn2/zip
    ../jkStuff/cpToWeb.sh
    cd /usr/local/apache/htdocs/goldenPath/rnJan2003
    #- Take a look at bigZips/* and chromosomes/*, update their README.txt's

    # Then make the upstream sequence files.
    cd bigZips
    featureBits rn2 refGene:upstream:1000 -fa=upstream1000.fa
    zip upstream1000.zip upstream1000.fa
    rm upstream1000.fa
    featureBits rn2 refGene:upstream:2000 -fa=upstream2000.fa
    zip upstream2000.zip upstream2000.fa
    rm upstream2000.fa
    featureBits rn2 refGene:upstream:5000 -fa=upstream5000.fa
    zip upstream5000.zip upstream5000.fa
    rm upstream5000.fa


PREPARE CLUSTER FOR BLASTZ RUN (DONE 03/06/03)
    # This needs to be done after trf-masking and nib generation.
    ssh kkstore
    # Extract lineage-specific repeats using Arian Smit's script:
    mkdir -p ~/rn2/bed/linSpecRep
    cd ~/rn2/bed/linSpecRep
    foreach f (~/rn2/*/chr*.out)
        ln -sf $f .
    end
    /cluster/bin/scripts/rodentSpecificRepeats.pl *.out
    /cluster/bin/scripts/perl-rename 's/(\.fa|\.nib)//' *.out.*spec
    /cluster/bin/scripts/perl-rename 's/\.(rod|prim)spec/.spec/' *.out.*spec
    rm *.out
    cd ..
    rm -rf /scratch/hg/rn2/linSpecRep
    mkdir -p /scratch/hg/rn2
    cp -Rp linSpecRep /scratch/hg/rn2
    # RepeatMasker .out:
    cd ~/rn2
    rm -rf /scratch/hg/rn2/rmsk
    mkdir -p /scratch/hg/rn2/rmsk
    cp -p ?{,?}/chr?{,?}{,_random}.fa.out /scratch/hg/rn2/rmsk
    # Chrom-level mixed nibs that have been repeat- and trf-masked:
    rm -rf /scratch/hg/rn2/chromTrfMixedNib
    mkdir -p /scratch/hg/rn2/chromTrfMixedNib
    cp -p mixedNib/chr*.nib /scratch/hg/rn2/chromTrfMixedNib
    # Ask cluster-admin@cse.ucsc.edu to binrsync /scratch/hg to clusters

    # Jim's comments Feb 12 '03 about the order in which to run blastz:
    # In general we should do
    # 1) hg/mm
    # 2) mm/rn
    # 3) rn/hg
    # 4) hg/hg
    # 5) mm/mm
    # 6) rn/rn
    # There is now an 'axtSwap' program that might let us
    # get out of having to run the inverse of 1,2 & 3,  though
    # 2 in particular is so fast perhaps it's just as well to
    # do the inverse explicitly.


MAKING AND STORING mRNA AND EST ALIGNMENTS  (DONE 02/09/03)

    # Load up the local disks of the cluster with refSeq.fa, mrna.fa and est.fa
    # from /cluster/store2/mrna.133  into /scratch/hg/mrna.133
    # Make sure that /scratch/hg/rn2/trfFa is loaded with chr*_*.fa and pushed 
    # to the cluster nodes.  
    ssh kk
    cd ~/rn2/bed
    foreach i (refSeq mrna est)
      mkdir -p $i
      cd $i
      ls -1S /scratch/hg/rn2/trfFa/* > genome.lst
      ls -1 /mnt/scratch/hg/mrna.133/Rattus_norvegicus/$i.fa > mrna.lst
      cp ~/lastRn/bed/$i/gsub .
      mkdir psl
      gensub2 genome.lst mrna.lst gsub spec
      para create spec
      cd ..
    end 

    # In each dir: para try, para check, para push, para check....
    # para time > time
      
    # Process refSeq, mRNA, and EST alignments into near best in genome.
    ssh eieio
    cd ~/rn2/bed
    cd refSeq
    pslSort dirs raw.psl /cluster/store2/temp psl
    pslReps -minCover=0.2 -sizeMatters -minAli=0.98 -nearTop=0.002 raw.psl \
      contig.psl /dev/null
    liftUp -nohead all_refSeq.psl ../../jkStuff/liftAll.lft warn contig.psl
    pslSortAcc nohead chrom /cluster/store2/temp all_refSeq.psl
    cd ..

    cd mrna
    pslSort dirs raw.psl /cluster/store2/temp psl
    pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl \
      /dev/null
    liftUp -nohead all_mrna.psl ../../jkStuff/liftAll.lft warn contig.psl
    pslSortAcc nohead chrom /cluster/store2/temp all_mrna.psl
    cd ..

    cd est
    pslSort dirs raw.psl /cluster/store2/temp psl
    pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl contig.psl \
      /dev/null
    liftUp -nohead all_est.psl ../../jkStuff/liftAll.lft warn contig.psl
    pslSortAcc nohead chrom /cluster/store2/temp all_est.psl
    cd ..

    # Load mRNA alignments into database.
    ssh hgwdev
    cd ~/rn2/bed/mrna/chrom
    foreach i (chr?{,?}{,_random}.psl)
        mv $i $i:r_mrna.psl
    end
    hgLoadPsl rn2 *.psl
    cd ..
    hgLoadPsl rn2 all_mrna.psl -nobin

    # Load EST alignments into database.
    ssh hgwdev
    cd ~/rn2/bed/est/chrom
    foreach i (chr?{,?}{,_random}.psl)
        mv $i $i:r_est.psl
    end
    hgLoadPsl rn2 *.psl
    cd ..
    hgLoadPsl rn2 all_est.psl -nobin

    # Create subset of ESTs with introns and load into database.
    ssh eieio
    cd ~/rn2
    tcsh jkStuff/makeIntronEst.sh
    ssh hgwdev
    cd ~/rn2/bed/est/intronEst
    hgLoadPsl rn2 *.psl

    # Load refSeq alignments into database
    ssh hgwdev
    cd ~/rn2/bed/refSeq
    pslCat -dir chrom > refSeqAli.psl
    hgLoadPsl rn2 -tNameIx refSeqAli.psl


CREATE REFSEQ GENES TRACK (DONE 02/09/03)
    # Load the refSeq mRNA
    ssh hgwdev
    mkdir -p /gbdb/rn2/mrna.133
    ln -s /cluster/store2/mrna.133/refSeq/org/Rattus_norvegicus/refSeq.fa \
      /gbdb/rn2/mrna.133
    hgLoadRna new rn2
    hgLoadRna add -type=refSeq rn2 /gbdb/rn2/mrna.133/refSeq.fa \
      /cluster/store2/mrna.133/refSeq/org/Rattus_norvegicus/refSeq.ra

    # Produce refGene, refPep, refMrna, and refLink tables as so:
    # Get the proteins:
    ssh eieio
    cd ~/rn2/bed/refSeq
    wget ftp://ftp.ncbi.nih.gov/refseq/R_norvegicus/mRNA_Prot/rat.faa.gz
    wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/loc2ref
    wget ftp://ftp.ncbi.nih.gov/refseq/LocusLink/mim2loc
    gunzip rat.faa.gz
    ssh hgwdev
    cd ~/rn2/bed/refSeq
    hgRefSeqMrna rn2 \
      /gbdb/rn2/mrna.133/refSeq.fa \
      /cluster/store2/mrna.133/refSeq/org/Rattus_norvegicus/refSeq.ra \
      all_refSeq.psl loc2ref rat.faa mim2loc
    # Don't worry about the "No gene name" errors

    # Add RefSeq status info
    hgRefSeqStatus -rat rn2 loc2ref


REFFLAT (DONE 02/09/03)
    # create precomputed join of refFlat and refGene:
    echo 'CREATE TABLE refFlat (KEY geneName (geneName), KEY name (name), KEY chrom (chrom)) SELECT refLink.name as geneName, refGene.* FROM refLink,refGene WHERE refLink.mrnaAcc = refGene.name' | hgsql rn2


LOAD MRNA DATA (DONE 02/09/03)
    ssh hgwdev
    ln -s /cluster/store2/mrna.133/org/Rattus_norvegicus/mrna.fa /gbdb/rn2/mrna.133
    ln -s /cluster/store2/mrna.133/org/Rattus_norvegicus/est.fa /gbdb/rn2/mrna.133
    hgLoadRna add -type=mRNA rn2 /gbdb/rn2/mrna.133/mrna.fa \
      /cluster/store2/mrna.133/org/Rattus_norvegicus/mrna.ra
    hgLoadRna add -type=EST rn2 /gbdb/rn2/mrna.133/est.fa \
      /cluster/store2/mrna.133/org/Rattus_norvegicus/est.ra


PRODUCING ESTORIENTINFO TABLE (DONE 03/06/03)

This table is needed for proper orientation of ESTs in the
browser.  Many will appear on the wrong strand without it.
This involves a cluster run.  First load the EST psl files
as so:
     ssh eieio
     cd ~/rn2/bed/est
     pslSortAcc nohead contigs /cluster/store2/temp contig.psl
     ssh kkstore
     mkdir /mnt/scratch/hg/rn2/est
     cd ~/rn2/bed/est
     cp -r contigs /mnt/scratch/hg/rn2/est

Wait for these to finish.
     mkdir -p ~/rn2/bed/estOrientInfo
     cd ~/rn2/bed/estOrientInfo
     mkdir ei
     ls -1S /mnt/scratch/hg/rn2/est/contigs/* > psl.lst
     echo placeholder > single
     cp ~/rn1/bed/estOrientInfo/gsub .
Update gsub to refer to the rat contig sequence currently on
/mnt/scratch/hg/rn2/trfFa, the rat EST alignments on
/mnt/scratch/hg/rn2/est/contigs, and the rat EST sequence in
/scratch/hg/mrna.133/Rattus_norvegicus/est.fa.
     gensub2 psl.lst single gsub spec

     ssh kk
     para create spec
Then run the job on the cluster
     cd ~/rn2/bed/estOrientInfo
     para try
     sleep 60
     para check
If things look good
     para push
Wait for this to finish then
     liftUp estOrientInfo.bed ../../jkStuff/liftAll.lft warn ei/*.tab
Load them into database as so:
     ssh hgwdev
     cd ~/rn2/bed/estOrientInfo
     hgLoadBed rn2 estOrientInfo estOrientInfo.bed \
       -sqlTable=/cluster/home/kent/src/hg/lib/estOrientInfo.sql
     
PRODUCING MRNAORIENTINFO TABLE (DONE 03/06/03)
    ssh eieio
    cd ~/rn2/bed/mrna
    pslSortAcc nohead contig /cluster/store2/temp contig.psl
    ssh kkstore
    mkdir /mnt/scratch/hg/rn2/mrna
    cp -r ~/rn2/bed/mrna/contig /mnt/scratch/hg/rn2/mrna
    mkdir -p ~/rn2/bed/mrnaOrientInfo/oi
    cd ~/rn2/bed/mrnaOrientInfo
    ls -1S /mnt/scratch/hg/rn2/mrna/contig/* > psl.lst
    cp ~/lastRn/bed/mrnaOrientInfo/gsub .
    echo placeholder > single
    gensub2 psl.lst single gsub spec

    ssh kk
    cd ~/rn2/bed/mrnaOrientInfo
    para create spec
    para try, para check, para push, para check,...
    liftUp mrnaOrientInfo.bed ../../jkStuff/liftAll.lft warn oi/*.tab

    ssh hgwdev
    cd ~/rn2/bed/mrnaOrientInfo
    hgLoadBed rn2 mrnaOrientInfo mrnaOrientInfo.bed \
       -sqlTable=/cluster/home/kent/src/hg/lib/mrnaOrientInfo.sql


CREATE RNACLUSTER TABLE (DONE 03/06/03)
    # Make sure that refSeqAli, estOrientInfo and mrnaOrientInfo tables are 
    # made already (see above).
    ssh hgwdev
    mkdir -p ~/rn2/bed/rnaCluster/chrom
    cd ~/rn2/bed/rnaCluster
    foreach i (~/rn2/?{,?})
      foreach f ($i/chr*.fa)
        set c = $f:t:r
        clusterRna rn2 /dev/null chrom/$c.bed -chrom=$c
        echo done $c
      end
    end
    hgLoadBed rn2 rnaCluster chrom/*.bed


PRODUCING GENSCAN PREDICTIONS (TODO - REDO)
    
    # Log into kkr1u00 (not kk!).  kkr1u00 is the driver node for the small
    # cluster (kkr2u00 - kkr8u00).  (genscan has problems running on the
    # big cluster, due to limited memory and swap space on each
    # processing node.)
    ssh kkr1u00
    mkdir -p ~/rn2/bed/genscan
    cd ~/rn2/bed/genscan
    # Make 3 subdirectories for genscan to put their output files in
    mkdir gtf pep subopt
    # Generate a list file, genome.list, of all the hard-masked contigs that 
    # *do not* consist of all-N's (which would cause genscan to blow up)
    rm -f genome.list
    touch genome.list
    foreach f ( `ls -1S /cluster/store4/rn2/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa.masked` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> genome.list
    end
    # Create template file, gsub, for gensub2.  For example (3-line file):
#LOOP
/cluster/home/kent/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
    echo "" > dummy.list
    gensub2 genome.list dummy.list gsub jobList
    para create jobList
    para try
    para check
    para push

    # If there are crashes, diagnose with "para problems".  
    # If a job crashes due to genscan running out of memory, re-run it 
    # manually with "-window=1200000" instead of "-window=2400000".
    # chr14_21, chr16_4
    
    # Convert these to chromosome level files as so:
    ssh eieio
    cd ~/rn2/bed/genscan
    liftUp genscan.gtf ../../jkStuff/liftAll.lft warn gtf/*.gtf
    liftUp genscanSubopt.bed ../../jkStuff/liftAll.lft warn subopt/*.bed
    cat pep/*.pep > genscan.pep

    # Load into the database as so:
    ssh hgwdev
    cd ~/rn2/bed/genscan
    ldHgGene rn2 genscan genscan.gtf
    hgPepPred rn2 generic genscanPep genscan.pep
    hgLoadBed rn2 genscanSubopt genscanSubopt.bed


SWAPPING HUMAN-RAT BLASTZ ALIGNMENTS TO RAT-HUMAN: (DONE 03/15/03)

    ssh eieio
    # Human-rat alignments were already run and processed into axt.  
    # Swap target and query to get rat-human alignments.  
    set aliDir = "/cluster/store4/gs.14/build31/bed/blastz.rn2.2003-03-13-ASH"
    set revAliDir = "/cluster/store4/rn2/bed/blastz.hg13.2003-03-13-SWAP"
    mkdir $revAliDir
    cd $revAliDir
    # axtBest will need .len files - copy those, swap S1<->S2
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    mkdir unsorted axtChrom
    # Swap target and query coords, then re-apportion alignments so that 
    # unsorted/chrN.axt has all the alignments with chrN as target.
    cat $aliDir/axtChrom/chr*.axt \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSplitByTarget stdin unsorted
    # Sort the shuffled .axt files.
    foreach f (unsorted/*.axt)
      echo sorting $f:t:r
      axtSort $f axtChrom/$f:t
    end
    rm -r unsorted
    # Don't bother creating psl for these unfiltered alignments -- but 
    # tell Jim so he can do chaining/netting.


MAKING THE BLASTZBESTHUMAN TRACK FROM PENN STATE RN2 AXT FILES (DONE 03/15/03)

    # Consolidate AXT files to chrom level, sort, pick best, make psl.
    ssh eieio
    set base="/cluster/store4/rn2/bed/blastz.hg13.2003-03-13-SWAP"
    set seq1_dir="/cluster/store4/rn2/mixedNib/"
    set seq2_dir="/cluster/store4/gs.14/build31/mixedNib/"
    set tbl="blastzBestHg13"
    cd $base
    mkdir -p axtBest pslBest
    foreach f (axtChrom/chr*.axt)
      set chr=$f:t:r
      echo axtBesting $chr
      axtBest axtChrom/$chr.axt $chr axtBest/$chr.axt -minScore=300
      echo translating axtBest to psl for $chr
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
    end
    # If some chromosome's alignments were too big and caused axtBest to 
    # run out of memory, split it in half (by 4-line axt records) and 
    # run axtBest just on the halves.  
    foreach chr (chr1)
      echo two-pass axtBesting $chr
      set len = `wc -l < axtChrom/$chr.axt`
      set numRec = `expr $len / 4`
      if (($numRec * 4) != $len) then
        echo "Uh-oh: length of axtChrom/$chr.axt is $len, not a multiple of 4"
        break
      endif
      set halfRec   = `expr $numRec / 2`
      set halfLen   = `expr $halfRec \* 4`
      set halfLenp1 = `expr $halfLen + 1`
      head -$halfLen   axtChrom/$chr.axt > axtChrom/$chr.h0.axt
      tail +$halfLenp1 axtChrom/$chr.axt > axtChrom/$chr.h1.axt
      axtBest axtChrom/$chr.h0.axt $chr axtChrom/$chr.h0.axtBest -minScore=300
      axtBest axtChrom/$chr.h1.axt $chr axtChrom/$chr.h1.axtBest -minScore=300
      cat axtChrom/$chr.h{0,1}.axtBest > axtBest/$chr.axt
      axtToPsl axtBest/$chr.axt S1.len S2.len pslBest/${chr}_${tbl}.psl
      rm axtChrom/$chr.h*
    end

    # Load tables
     ssh hgwdev
     set base="/cluster/store4/rn2/bed/blastz.hg13.2003-03-13-SWAP"
     set tbl="blastzBestHg13"
     cd $base/pslBest
     hgLoadPsl rn2 chr*_${tbl}.psl

    # Make /gbdb links and add them to the axtInfo table:
     mkdir -p /gbdb/rn2/axtBestHg13
     cd /gbdb/rn2/axtBestHg13
     foreach f ($base/axtBest/chr*.axt)
       ln -s $f .
     end
     cd $base/axtBest
     rm -f axtInfoInserts.sql
     touch axtInfoInserts.sql
     foreach f (/gbdb/rn2/axtBestHg13/chr*.axt)
       set chr=$f:t:r
       echo "INSERT INTO axtInfo VALUES ('hg13','Blastz Best in Genome','$chr','$f');" \
         >> axtInfoInserts.sql
     end
     hgsql rn2 < ~/kent/src/hg/lib/axtInfo.sql
     hgsql rn2 < axtInfoInserts.sql

MAKING THE HUMAN AXTTIGHT FROM AXTBEST (DONE 03/15/03)
    # After creating axtBest alignments above, use subsetAxt to get axtTight:
    ssh eieio
    cd ~/rn2/bed/blastz.hg13.2003-03-13-SWAP/axtBest
    mkdir -p ../axtTight
    foreach i (*.axt)
      subsetAxt  $i ../axtTight/$i \
        ~kent/src/hg/mouseStuff/subsetAxt/coding.mat 3400
    end
    # translate to psl
    cd ../axtTight
    mkdir -p ../pslTight
    foreach i (*.axt)
      set c = $i:r
      axtToPsl $i ../S1.len ../S2.len ../pslTight/${c}_blastzTightHg13.psl
    end
    # Load tables into database
    ssh hgwdev
    cd ~/rn2/bed/blastz.hg13.2003-03-13-SWAP/pslTight
    hgLoadPsl rn2 chr*_blastzTightHg13.psl


TWINSCAN GENE PREDICTIONS (DONE 03/26/03)

    mkdir -p ~/rn2/bed/twinscan
    cd ~/rn2/bed/twinscan
    wget http://genome.cse.wustl.edu/~bio/rat/Jan03/rat_Jan03_03-26-03.tgz
    gunzip -c *.tgz | tar xvf -
    rm -r chr_tx
    # clean up chrom field of GTF files
    foreach f (chr_gtf/chr*.gtf)
      set chr = $f:t:r
      sed -e "s/^[a-zA-Z0-9]*/$chr/" $f > chr_gtf/$chr-fixed.gtf
    end
    # pare down protein FASTA header to id and add missing .a:
    foreach f (chr_ptx/chr*.ptx)
      set chr = $f:t:r
      perl -wpe 's/^\>.*\s+source_id\s*\=\s*(\S+).*$/\>$1.a/;' < \
        chr_ptx/$chr.ptx > chr_ptx/$chr-fixed.fa
    end
    ldHgGene rn2 twinscan chr_gtf/chr*-fixed.gtf -exon=CDS
    hgPepPred rn2 generic twinscanPep chr_ptx/chr*-fixed.fa


PRODUCING CROSS_SPECIES mRNA ALIGNMENTS (DONE 03/11/03)

    # Here you align non-rat mRNAs against the masked genome on the
    # cluster you set up during the previous step.
    # Make sure that gbpri, gbmam, gbrod, and gbvert are downloaded from 
    # Genbank into /cluster/store2/genbank.133 and unpacked by organism into 
    # /cluster/store2/mrna.133/org. 

    # Set up cluster run more or less as so:
      ssh kk
      cd ~/rn2/bed
      mkdir xenoMrna
      cd xenoMrna
      ls -1S /scratch/hg/rn2/trfFa/* > genome.lst
      cp -R /cluster/store2/mrna.133/org /mnt/scratch/hg/mrna.133
    # The below ls command fails when you have too many files so skip it and 
    # instead run the find command after it.
    #      ls -1S /mnt/scratch/hg/mrna.133/org/*/mrna.fa > allMrna.lst
      find /mnt/scratch/hg/mrna.133/org -name mrna.fa -ls \
        | awk '{print $7,$11}' | grep -v /Rattus_norvegicus/ \
        | sort -gr | awk '{print $2}' \
        >  allMrna.lst
    # Put the first line of allMrna.lst into 1.org, the second line into 
    # 2.org, and so forth:
      foreach n (1 2 3 4 5 6)
        head -$n allMrna.lst | tail -1 > $n.org
      end
    # After the 6th line just leave the rest in 7.org.
      tail +7 allMrna.lst > 7.org
    # Then
      ls -1 *.org > mrna.lst
      cp ~/lastRn/bed/xenoMrna/gsub .
      mkdir psl
      gensub2 genome.lst mrna.lst gsub spec
      para create spec
      para try
      para check
    # If all looks well do
      para push

    # Sort xeno mRNA alignments as so:
    # (raw.psl -> cooked.psl -> chrom.psl -> chrom/ -> xenoMrna.psl; each step
    # reads the previous step's output, so the order matters.)
       ssh eieio
       cd ~/rn2/bed/xenoMrna
       # Combine the cluster output under psl/ into raw.psl, using
       # /cluster/store2/temp as scratch space.
       pslSort dirs raw.psl /cluster/store2/temp psl
       # Filter raw.psl into cooked.psl; the second output is discarded.
       # NOTE(review): -minAli=0.25 is presumably the minimum alignment ratio
       # kept -- confirm against pslReps usage.
       pslReps raw.psl cooked.psl /dev/null -minAli=0.25
       # Lift contig coordinates to chromosome coordinates via liftAll.lft.
       liftUp chrom.psl ../../jkStuff/liftAll.lft warn cooked.psl
       pslSortAcc nohead chrom /cluster/store2/temp chrom.psl
       # Concatenate the files in chrom/ into the single file that gets loaded.
       pslCat -dir chrom > xenoMrna.psl
       # Intermediate files are no longer needed once xenoMrna.psl exists.
       rm -r chrom raw.psl cooked.psl chrom.psl

    # Load into database as so:
       ssh hgwdev
       cd ~/rn2/bed/xenoMrna
       hgLoadPsl rn2 xenoMrna.psl -tNameIx

    # Make the xenoRna file
    # NOTE(review): ratXenoRna.fa/.ra are assumed to already exist under
    # /cluster/store2/mrna.133; their creation is not shown in this section.
       # Make a /gbdb symlink for the .fa (not .ra)
       cd /gbdb/rn2/mrna.133
       ln -s /cluster/store2/mrna.133/ratXenoRna.fa ratXenoRna.fa
       # Load via the /gbdb path for the .fa; the .ra is read from its
       # original location.
       hgLoadRna add -type=xenoRna rn2 /gbdb/rn2/mrna.133/ratXenoRna.fa \
         /cluster/store2/mrna.133/ratXenoRna.ra


PRODUCING TETRAODON FISH ALIGNMENTS (TODO)

o - Download sequence from ... and put it on the cluster local disk
    at
       /scratch/hg/fish
o - Do fish/rat alignments.
       ssh kk
       cd ~/rn2/bed
       mkdir blatFish
       cd blatFish
       mkdir psl
       ls -1S /scratch/hg/fish/* > fish.lst
       ls -1S /scratch/hg/rn2/trfFa/* > rat.lst
       cp ~/lastRn/blatFish/gsub .
       gensub2 rat.lst fish.lst gsub spec
       para create spec
       para try
     Make sure jobs are going ok with para check.  Then
       para push
     wait about 2 hours and do another
       para push
     do para checks and if necessary para pushes until done
     or use para shove.
o - Sort alignments as so 
       pslCat -dir psl | liftUp -type=.psl stdout ~/rn2/jkStuff/liftAll.lft warn stdin | pslSortAcc nohead chrom /cluster/store2/temp stdin
o - Copy to hgwdev:/scratch.  Rename to correspond with tables as so and 
    load into database:
       ssh hgwdev
       cd ~/rn2/bed/blatFish/chrom
       foreach i (chr?{,?}{,_random}.psl)
           set r = $i:r
           mv $i ${r}_blatFish.psl
       end
       hgLoadPsl rn2 *.psl
       hgLoadRna addSeq rn2 /cluster/store2/fish/seq15jun2001/*.fa

# PRODUCING SQUIRT ALIGNMENTS  (DONE 2003-06-04 - braney)
    # List query (squirt) and target (rat) sequence files, largest first.
    ssh kkstore
    mkdir -p ~/rn2/bed/blatCi1
    cd ~/rn2/bed/blatCi1
    ls -1S /iscratch/i/squirt/ci1/queryFa/*.fa > squirt.lst
    ls -1S /scratch/hg/rn2/trfFa/* > rat.lst

    # Start from a clean psl/ with one output directory per rat file,
    # named after the file's base name without extension.
    rm -rf psl
    foreach ctg (`cat rat.lst`)
      mkdir -p psl/$ctg:t:r
    end
    # get gsub2D from someplace
    gensub2 rat.lst squirt.lst gsub2D spec

    ssh kk
    cd ~/rn2/bed/blatCi1
    para create spec
    # .... (the remaining para steps were not recorded here)
    # When cluster run is done, sort alignments:
    ssh eieio
    cd ~/rn2/bed/blatCi1
    # -p: the per-user scratch directory may already exist from an earlier run.
    mkdir -p /tmp/$LOGNAME
    pslSort dirs raw.psl /tmp/$LOGNAME psl/*
    pslReps raw.psl cooked.psl /dev/null -minAli=0.05
    liftUp -nohead lifted.psl ../../jkStuff/liftAll.lft warn cooked.psl
    pslSortAcc nohead chrom /tmp/$LOGNAME lifted.psl

    # Rename to correspond with tables as so and load into database:
    ssh hgwdev
    cd ~/rn2/bed/blatCi1/chrom
    # Remove results of any previous rename so only fresh files get loaded.
    rm -f chr*_blatCi1.psl
    foreach i (chr?{,?}{,_random}.psl)
        set r = $i:r
        mv $i ${r}_blatCi1.psl
    end
    hgLoadPsl rn2 *.psl

    # Make squirt /gbdb/ symlink
    # -p for consistency with the fugu section and so a rerun does not fail.
    mkdir -p /gbdb/rn2/squirtSeq
    cd /gbdb/rn2/squirtSeq
    ln -s /cluster/store5/squirt/ci1/ciona.rm.fasta

PRODUCING FUGU FISH ALIGNMENTS (DONE 03/13/03)

    # (Already done, for mm2:)
    # Download sequence to /cluster/store3/fuguSeq from ... and put it on the 
    # cluster local disk at /scratch/hg/fugu on kkstore.
    # Sequence was downloaded from:
    # ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_mask.fasta.Z
    # ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3_prot.fasta.Z
    # mkdir split2.5Mb; cd split2.5Mb;
    # faSplit about ../fugu_v3_mask.fasta 2500000 fuguSplit

    # Refresh the fugu split sequence under /iscratch/i/fugu from the copy
    # on /cluster/store3, then run iSync.
    # NOTE(review): iSync presumably pushes /iscratch to the other cluster
    # nodes -- confirm.
    ssh kkr1u00
    rm -rf /iscratch/i/fugu
    mkdir /iscratch/i/fugu
    cp -p /cluster/store3/fuguSeq/split2.5Mb/*.fa /iscratch/i/fugu
    ~kent/bin/iSync

    # Set up the cluster run: fugu pieces vs. rat trfFa files, largest first,
    # with the gsub template copied from the previous rat build.
    ssh kk
    mkdir ~/rn2/bed/blatFugu
    cd ~/rn2/bed/blatFugu
    ls -1S /iscratch/i/fugu/* > fugu.lst
    ls -1S /scratch/hg/rn2/trfFa/* > rat.lst
    cp ~/lastRn/bed/blatFugu/gsub .
    mkdir psl
    # One output directory per rat pseudo-contig .fa, named after the file's
    # base name without extension.
    foreach f (~/rn2/?{,?}/chr*/chr?{,?}{,_random}_?{,?}.fa)
      set c=$f:t:r
      mkdir psl/$c
    end
    gensub2 rat.lst fugu.lst gsub spec
    para create spec
    para try
    para check
    para push
    para check
    # Sort alignments:
    # (concatenate everything under psl/*, lift to chromosome coordinates
    # with liftAll.lft, and write the sorted result into chrom/)
    ssh eieio
    cd ~/rn2/bed/blatFugu
    pslCat -dir psl/* \
      | liftUp -type=.psl stdout ~/rn2/jkStuff/liftAll.lft warn stdin \
      | pslSortAcc nohead chrom /cluster/store2/temp stdin

    # load into database:
    ssh hgwdev
    cd ~/rn2/bed/blatFugu/chrom
    # Rename chrN.psl to chrN_blatFugu.psl before loading.
    foreach i (chr?{,?}{,_random}.psl)
        set r = $i:r
        mv $i ${r}_blatFugu.psl
    end
    hgLoadPsl rn2 *.psl
    # Symlink the masked fugu sequence into /gbdb and load it from that path.
    mkdir -p /gbdb/rn2/fuguSeq
    cd /gbdb/rn2/fuguSeq
    ln -s /cluster/store3/fuguSeq/fugu_v3_mask.fasta
    cd /cluster/store2/temp
    hgLoadRna addSeq rn2 /gbdb/rn2/fuguSeq/fugu_v3_mask.fasta


MAKE LIFT FILE FOR AGPS (DONE 02/05/03)
    ssh eieio
    # Run from the top of the build directory: the script path, the per-chrom
    # AGP glob and the output file are all given relative to ~/rn2.
    cd ~/rn2
    ./jkStuff/agpToLift.pl chrom.sizes ?{,?}/chr?{,?}{,_random}.agp \
      > jkStuff/liftRNOR.lft


LOAD BACTIG POSITIONS (DONE 02/18/03)

    ssh hgwdev
    mkdir -p ~/rn2/bed/bactigPos
    cd ~/rn2/bed/bactigPos
    # Input is a BED 4+ e-mail attachment from Paul Havlak
    # (havlak@swan.hgsc.bcm.tmc.edu); save it here as
    # ~/rn2/bed/bactigPos/Rnor2-1.extreme.fix before continuing.
    # Its starts are 1-based, so subtract 1 from column 2 and keep the first
    # six tab-separated columns:
    awk 'BEGIN { FS = "\t" } { printf "%s\t%d\t%s\t%s\t%s\t%s\n", $1, $2-1, $3, $4, $5, $6 }' Rnor2-1.extreme.fix > bactigPos.bed
    hgLoadBed rn2 bactigPos bactigPos.bed -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/bactigPos.sql


LOAD CPGISLANDS (DONE 03/06/03)
    ssh eieio
    mkdir -p ~/rn2/bed/cpgIsland
    cd ~/rn2/bed/cpgIsland
    # Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu)
    # copy the tar file to the current directory
    cp ~/lastRn/bed/cpgIsland/cpg_dist.tar .
    tar xvf cpg_dist.tar
    cd cpg_dist
    gcc readseq.c cpg_lh.c -o cpglh.exe
    cd ..
    # Run cpglh on every masked chromosome.  fout already carries the .cpg
    # suffix (chrN.fa.masked -> chrN.cpg), so redirect to $fout itself; the
    # file written then matches the name echoed on the line above it.
    foreach f (../../?{,?}/chr?{,?}{,_random}.fa.masked)
      set fout=$f:t:r:r.cpg
      echo running cpglh on $f to $fout
      ./cpg_dist/cpglh.exe $f > $fout
    end
    # copy filter.awk from a previous release
    cp ~/lastRn/bed/cpgIsland/filter.awk .
    awk -f filter.awk chr*.cpg > cpgIsland.bed
    # load into database:
    ssh hgwdev
    cd ~/rn2/bed/cpgIsland
    hgLoadBed rn2 cpgIsland -tab -noBin \
      -sqlTable=$HOME/kent/src/hg/lib/cpgIsland.sql cpgIsland.bed


LOAD SOFTBERRY GENES (DONE 02/04/03)
     # Fetch and unpack the Softberry tarball in its own directory, then load
     # the gene GFFs and the .protein files from there.
     mkdir /cluster/store4/rn2/bed/softberry
     cd /cluster/store4/rn2/bed/softberry
     wget ftp://www.softberry.com/pub/SC_RAT_JAN03/Softb_rat_gff_j03.tar.gz
     tar xvzf Softb_rat_gff_j03.tar.gz
     ldHgGene rn2 softberryGene chr*.gff
     hgPepPred rn2 softberry *.protein
     hgSoftberryHom rn2 *.protein

LOAD GENEID GENES (DONE 04/01/03)
    mkdir -p ~/rn2/bed/geneid/download
    cd ~/rn2/bed/geneid/download
    # For every assembled chromosome fasta, fetch the matching geneid
    # predictions (.gtf) and proteins (.prot).
    set geneidUrl = http://genome.imim.es/genepredictions/R.norvegicus/rnJan2003/geneid_v1.1
    foreach fa (~/rn2/?{,?}/chr?{,?}{,_random}.fa)
      set chrom = $fa:t:r
      foreach ext (gtf prot)
        wget $geneidUrl/$chrom.$ext
      end
    end
    # Add missing .1 to protein id's (header lines of the form >chr...)
    foreach p (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $p > $p:r-fixed.prot
    end
    cd ..
    ldHgGene rn2 geneid download/*.gtf -exon=CDS
    hgPepPred rn2 generic geneidPep download/*-fixed.prot

SGP GENE PREDICTIONS (DONE 2003-05-19 - Hiram)  (RELOADED 10/28/03 angie)
    mkdir -p ~/rn2/bed/sgp/download
    cd ~/rn2/bed/sgp/download
    # For every assembled chromosome fasta, fetch the matching SGP
    # predictions (.gtf) and proteins (.prot).
    set sgpUrl = http://genome.imim.es/genepredictions/R.norvegicus/rnJan2003/SGP/humangp20021114
    foreach fa (~/rn2/?{,?}/chr?{,?}{,_random}.fa)
      set chrom = $fa:t:r
      foreach ext (gtf prot)
        wget $sgpUrl/$chrom.$ext
      end
    end
    # Add missing .1 to protein id's (header lines of the form >chr...)
    foreach p (*.prot)
      perl -wpe 's/^(>chr\w+)$/$1.1/' $p > $p:r-fixed.prot
    end
    cd ..
    ldHgGene rn2 sgpGene download/*.gtf -exon=CDS
    hgPepPred rn2 generic sgpPep download/*-fixed.prot

SGP GENES (UPDATE 1/18/2006)
    sgpPep table dropped, replaced by hgc generated protein seq in browser

TIGR GENE INDEX (TODO)
    mkdir -p ~/rn2/bed/tigr
    cd ~/rn2/bed/tigr
    wget ftp://ftp.tigr.org/private/NHGI_mgi_jiashu/TGI_track_RatGenome_Feb2002.tgz
    gunzip -c TGI_track_RatGenome_Feb2002.tgz | tar xvf -
    # Rename "cattle" to "cow" in the unpacked file names.
    foreach f (*cattle*)
      set f1 = `echo $f | sed -e 's/cattle/cow/g'`
      mv $f $f1
    end
    # For each organism: drop the first line of every chr*_<organism>*s file,
    # rewrite the first THC to TC and prefix the first TC/THC id with the
    # organism name (passed to perl through the environment variable O).
    # NOTE(review): the organism list used to name "rat" twice, which only
    # repeated identical work; check whether another organism was intended.
    foreach o (rat cow human pig)
      setenv O $o
      foreach f (chr*_$o*s)
        # "tail -n +2" replaces the obsolete "tail +2" spelling.
        tail -n +2 $f | perl -wpe 's/THC/TC/; s/(TH?C\d+)/$ENV{O}_$1/;' > $f.gff
      end
    end
    ldHgGene -exon=TC rn2 tigrGeneIndex *.gff


LOAD STS MAP (todo)
     - login to hgwdev
      cd ~/rn2/bed
      rn2 < ~/src/hg/lib/stsMap.sql
      mkdir stsMap
      cd stsMap
      bedSort /projects/cc/hg/mapplots/data/tracks/build28/stsMap.bed stsMap.bed
      - Enter database with "rn2" command.
      - At mysql> prompt type in:
          load data local infile 'stsMap.bed' into table stsMap;
      - At mysql> prompt type
          quit

LOAD MGI IDs (TODO)
      - The Locuslink ID to MGI ID conversion data file,
        LL2MGI.txt, from Jackson Lab should be found under
        ~/rn2/bed/refSeq
      - login to hgwdev
      
      cd ~/rn2/bed/refSeq
      rn2 < ~/src/hg/lib/mgiID.sql
      - Enter database with "rn2" command.
      - At mysql> prompt type in:
          load data local infile 'LL2MGI.txt' into table MGIid;
      - At mysql> prompt type
          quit

LOAD CHROMOSOME BANDS (todo)
      - login to hgwdev
      cd /cluster/store4/rn2/bed
      mkdir cytoBands
      cp /projects/cc/hg/mapplots/data/tracks/build28/cytobands.bed cytoBands
      rn2 < ~/src/hg/lib/cytoBand.sql
      Enter database with "rn2" command.
      - At mysql> prompt type in:
          load data local infile 'cytobands.bed' into table cytoBand;
      - At mysql> prompt type
          quit

LOAD RATREF TRACK (todo)
    First copy in data from kkstore to ~/rn2/bed/ratRef.  
    Then substitute 'genome' for the appropriate chromosome 
    in each of the alignment files.  Finally do:
       hgRefAlign webb rn2 ratRef *.alignments

LOAD AVID RAT TRACK (todo)
      ssh cc98
      cd ~/rn2/bed
      mkdir avidRat
      cd avidRat
      wget http://pipeline.lbl.gov/tableCS-LBNL.txt
      # Turn the downloaded table into two BED files, then load each as its
      # own track.  hgLoadBed needs the database name first (rn2), as in
      # every other hgLoadBed call in this document.
      hgAvidShortBed *.txt avidRepeat.bed avidUnique.bed
      hgLoadBed rn2 avidRepeat avidRepeat.bed
      hgLoadBed rn2 avidUnique avidUnique.bed

LOAD SNPS (TODO)
      - ssh hgwdev
      - cd ~/rn2/bed
      - mkdir snp
      - cd snp
      - Download SNPs from ftp://ftp.ncbi.nlm.nih.gov/pub/sherry/rat.b27.out.gz
      - Unpack.
        createBed < rat.b27.out > snpNih.bed
        hgLoadBed rn2 snpNih snpNih.bed

LOAD ENSEMBL ESTs (TODO)
     ln -s /cluster/store4/rn2 ~/rn2
     mkdir -p ~/rn2/bed/ensembl
     cd ~/rn2/bed/ensembl
     wget http://www.ebi.ac.uk/~stabenau/rat-est.gz
     wget http://www.ebi.ac.uk/~stabenau/rat-est.pep.gz
     # Prefix "chr" to every line that begins with a word character.
     gunzip -c rat-est.gz | \
       perl -w -p -e 's/^(\w)/chr$1/' > rat-est-fixed.gtf
     ldHgGene rn2 ensEst rat-est-fixed.gtf
     # Quoted note about the .pep file (kept as a comment so that it is not
     # taken for a shell redirection when this section is pasted):
     # > The id behind '>' is internal and was not in our gtf dump, so
     # > you have to do some more parsing.
     # pick out the transcript= attribute -- that's the id to use:
     # also remove the first line ("tail -n +2"; the bare "tail +2" form is
     # obsolete):
     gunzip -c rat-est.pep.gz | tail -n +2 | \
       perl -w -p -e 's/^\>gene_id=.*transcript=(\w+)\s+.*$/\>$1/' > \
       rat-est-fixed.pep
     hgPepPred rn2 generic ensEstPep rat-est-fixed.pep

LOAD ENSEMBL GENES (TODO)
     mkdir -p ~/rn2/bed/ensembl
     cd ~/rn2/bed/ensembl
     wget http://www.ebi.ac.uk/~stabenau/rat-ensembl.gz
     wget http://www.ebi.ac.uk/~stabenau/rat-ensembl.pep.gz
     # Prefix "chr" to every line that begins with a word character.
     gunzip -c rat-ensembl.gz | \
       perl -w -p -e 's/^(\w)/chr$1/' > rat-ensembl-fixed.gtf
     ldHgGene rn2 ensGene rat-ensembl-fixed.gtf
     # Quoted note about the .pep file (kept as a comment so that it is not
     # taken for a shell redirection when this section is pasted):
     # > rat-ensembl contains stopcodons, due to some glitches in our
     # > genebuild. The id behind '>' is internal and was not in our gtf dump, so
     # > you have to do some more parsing.
     # pick out the transcript= attribute -- that's the id to use:
     # also remove the first line.  Only the gzipped peptide file is
     # downloaded above, so read it through gunzip -c as the EST section does:
     gunzip -c rat-ensembl.pep.gz | tail -n +2 | \
       perl -w -p -e 's/^\>gene_id=.*transcript=(\w+)\s+.*$/\>$1/' > \
       rat-ensembl-fixed.pep
     hgPepPred rn2 generic ensPep rat-ensembl-fixed.pep

LOAD RNAGENES (todo)
      - login to hgwdev
      - cd ~kent/src/hg/lib
      - rn2 < rnaGene.sql
      - cd /cluster/store4/rn2/bed
      - mkdir rnaGene
      - cd rnaGene
      - download data from ftp.genetics.wustl.edu/pub/eddy/pickup/ncrna-oo27.gff.gz
      - gunzip *.gz
      - liftUp chrom.gff ../../jkStuff/liftAll.lft carry ncrna-oo27.gff
      - hgRnaGenes rn2 chrom.gff

LOAD EXOFISH (todo)
     - login to hgwdev
     - cd /cluster/store4/rn2/bed
     - mkdir exoFish
     - cd exoFish
     - rn2 < ~kent/src/hg/lib/exoFish.sql
     - Put email attachment from Olivier Jaillon (ojaaillon@genoscope.cns.fr)
       into /cluster/store4/rn2/bed/exoFish/all_maping_ecore
     - awk -f filter.awk all_maping_ecore > exoFish.bed
     - hgLoadBed rn2 exoFish exoFish.bed

LOAD GENIE (TODO)
     # Download and unpack the Genie GTF and peptide tarballs into
     # ~/rn2/bed/genieAlt, loading each right after it is unpacked.
     mkdir -p ~/rn2/bed/genieAlt
     cd ~/rn2/bed/genieAlt
     wget http://www.neomorphic.com/mgap/mgscv3/gtf/mgscv3.genie.gtf.tgz
     tar xvzf mgscv3.genie.gtf.tgz
     ldHgGene rn2 genieAlt mgscv3.genie.gtf/chr*.gtf
     wget http://www.neomorphic.com/mgap/mgscv3/fa/mgscv3.aa.tgz
     tar xvzf mgscv3.aa.tgz
     hgPepPred rn2 genie geniePep chr*.aa.fa

LOAD GENIE CLONE BOUNDS (TODO)
     mkdir -p ~/rn2/bed/genieBounds
     cd ~/rn2/bed/genieBounds
     wget http://www.neomorphic.com/mgap/mgscv3/cb.bed/mgscv3_cb.bed.tgz
     gunzip -c mgscv3_cb.bed.tgz | tar xvf -
     # Trim the track definition from each file (these are actually custom
     # track files).  "tail -n +2" drops the first line; the bare "tail +2"
     # form is obsolete.
     # NOTE(review): this chromosome list has no 20, unlike the download
     # loops at the top of this document -- confirm against the unpacked
     # file names.
     foreach c (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 X Un)
       tail -n +2 chr${c}_cb.bed > chr${c}_cb-fixed.bed
     end
     hgLoadBed rn2 genieBounds *-fixed.bed

LOAD SOFTBERRY GENES (todo)
     - ln -s /cluster/store4/rn2 ~/rn2
     - cd ~/rn2/bed
     - mkdir softberry
     - cd softberry
     - get ftp://www.softberry.com/pub/SC_MOU_NOV01/softb_mou_genes_nov01.tar.gz
     ldHgGene rn2 softberryGene chr*.gff
     hgPepPred rn2 softberry *.protein
     hgSoftberryHom rn2 *.protein

LOAD GENOMIC DUPES (todo)
o - Load genomic dupes
    ssh hgwdev
    cd ~/rn2/bed
    mkdir genomicDups
    cd genomicDups
    wget http://codon/jab/web/takeoff/oo33_dups_for_kent.zip
    unzip *.zip
    # NOTE(review): filter.awk is not created in this section; presumably it
    # has to be copied from a previous build first, as the cpgIsland section
    # does.
    awk -f filter.awk oo33_dups_for_kent > genomicDups.bed
    hgsql rn2 < ~/src/hg/lib/genomicDups.sql
    # Load the file written by awk above (genomicDups.bed, not "Dupes").
    hgLoadBed rn2 -oldTable genomicDups genomicDups.bed

LOAD RGD CURATED GENES TRACK
    - cd rn2
    - cd bed
    - mkdir rgdGene
    - Browse to http://zephyrus.brc.mcw.edu/cgi-bin/pub/viewcvs.cgi/pub_gbrowse/gff_files/RGD_curated_genes.gff
        This is a web-based CVS page. Click the download link and save the file to ~/rn2/bed/RGD_curated_genes.gff
     - Now massage the data format using:
         rn2/bed/rgdGene/massage.pl
     - Load the data:
        ldHgGene rn2 rgdGene Fixed_RGD_Curated_genes.gff
     - Create the link table for searching
        In mysql for the rn2 database do:
        create table rgdLink (id int primary key, name varchar(32) not null);
        LOAD DATA LOCAL INFILE 'RGD.links' into table rgdLink; 
     

FAKING DATA FROM PREVIOUS VERSION
(This is just for until proper track arrives.  Rescues about
97% of data.  Just an experiment, not really followed through on).

o - Rescuing STS track:
     - log onto hgwdev
     - mkdir ~/rn2/rescue
     - cd !$
     - mkdir sts
     - cd sts
     - bedDown hg3 mapGenethon sts.fa sts.tab
     - echo ~/rn2/sts.fa > fa.lst
     - pslOoJobs ~/rn2 ~/rn2/rescue/sts/fa.lst ~/rn2/rescue/sts g2g
     - log onto cc01
     - cd ~/rn2/rescue/sts
     - split all.con into 3 parts and condor_submit each part
     - wait for assembly to finish
     - cd psl
     - mkdir all
     - ln ?/*.psl ??/*.psl *.psl all
     - pslSort dirs raw.psl temp all
     - pslReps raw.psl contig.psl /dev/null
     - rm raw.psl
     - liftUp chrom.psl ../../../jkStuff/liftAll.lft carry contig.psl
     - rm contig.psl
     - mv chrom.psl ../convert.psl

# CREATE FULL TEXT INDEX FOR KNOWN GENES (DONE 1/19/2006 JK)
# This depends on the go and uniProt databases as well as
# the kgAlias and kgProtAlias tables.  The hgKgGetText takes
# about 5 minutes when the database is not too busy.  The rest
# is real quick.
     ssh hgwdev
     cd /cluster/data/rn2/bed
     mkdir -p knownGene/index
     # cd takes no -p option; the directory was just created by mkdir -p.
     cd knownGene/index
     # Dump the text for the known genes, build the .ix/.ixx index pair from
     # it, and link both index files into /gbdb/rn2.
     hgKgGetText rn2 knownGene.text
     ixIxx knownGene.text knownGene.ix knownGene.ixx
     ln -s /cluster/data/rn2/bed/knownGene/index/knownGene.ix /gbdb/rn2/knownGene.ix
     ln -s /cluster/data/rn2/bed/knownGene/index/knownGene.ixx /gbdb/rn2/knownGene.ixx

# MYTOUCH FIX - jen - 2006-01-24
# Run mytouch on the rn2 tables below with the first timestamp, then run it
# again on geneidPep and twinscanPep with the second one.
  foreach tbl (geneidPep twinscanPep dupSpMrna keggPathway kgAlias kgProtAlias kgXref)
    sudo mytouch rn2 $tbl 0403251000.00
  end
  foreach tbl (geneidPep twinscanPep)
    sudo mytouch rn2 $tbl 0404031400.00
  end

  Other fixes at same time:
  Adjusted all.joiner rule to remove false error
  added !rn to ensemblTranscriptId
  $kgDb,!rn2.knownToEnsembl.value chopAfter=.
  check was comparing an empty table to a track (ensGene) that does
  not exist for this database.

