# for emacs: -*- mode: sh; -*-


# This file describes how we made the browser database on the 
# Fugu Rubripes (Japanese pufferfish), whole genome shotgun assembly dated 
# 26 August 2002, from the DOE Joint Genome Institute (v3.0).
# This release contains ~320Mbases, in 20,378 scaffolds.
# Size distribution is: 2 scaffolds ~1Mbase, 17 are 500-1000kbase.
# Kate Rosenbloom 4/03

#  NOTE:  this doc may have genePred loads that fail to include
#  the bin column.  Please correct that for the next build by adding
#  a bin column when you make any of these tables:
#
#  mysql> SELECT tableName, type FROM trackDb WHERE type LIKE "%Pred%";
#  +-----------+---------------------+
#  | tableName | type                |
#  +-----------+---------------------+
#  | ensGene   | genePred ensPep     |
#  | genscan   | genePred genscanPep |
#  +-----------+---------------------+


# DOWNLOAD THE SEQUENCE FROM JGI (04/22/03 KRR)

    ssh eieio
    set fugudir = /cluster/store5/Fugu_Rubripes_V3
    mkdir $fugudir
    cd $fugudir

    # NOTE: there are two downloads available on the site -- one
    # is named indicating it is masked, and contains additional N's.
    # The other file (the one we are using) contains lower-case,
    # indicating it is soft-masked. (As an aside, the mask locations are not
    # consistent between the two files).
    # We use the soft-masked file, since it has more sequence; however
    # we will translate to upper case and repeat mask ourselves, 
    # so that we can generate a Repeats track.
    #
    # The file consists of fasta records, each named scaffold_#.
    # Unfortunately, the ordering of scaffolds is alphabetic, not numeric,
    # so scaffold_1 is followed by scaffold_10, not scaffold_2.
    # Also, the scaffold numbering has holes -- the highest numbered
    # scaffold is 32607. 

    wget ftp://ftp.jgi-psf.org/pub/JGI_data/Fugu/fugu_v3.fasta.Z
    ln -s $fugudir ~/fr1
    cd ~/fr1
    gunzip fugu_v3.fasta.Z

    # trim off garbage at end of file (list of scaffolds),
    # this appears to be an artifact of the release process

    mv fugu_v3.fasta fugu_v3.orig.fasta
    grep -v '^scaffold' fugu_v3.orig.fasta > fugu_v3.fasta

    # force to upper case (see above), but preserve lower case headers
    tr '[a-z]' '[A-Z]' < fugu_v3.fasta | \
        sed 's/>SCAFFOLD_/>scaffold_/' > fugu_v3.upper.fasta


# SPLIT THE FASTA FILE INTO SCAFFOLDS FOR FURTHER PROCESSING (DONE KRR)

    ssh eieio
    set fugudir = /cluster/store5/Fugu_Rubripes_V3
    cd $fugudir
    mkdir scaffolds

    # NOTE: must give a record count larger than the number of scaffolds
    # next time, use "faSplit byName" to do this
    faSplit sequence fugu_v3.upper.fasta 30000 scaffolds/

    # rename files to match scaffold (in fasta header)
    # and store in directories (00 .. 32) for convenience and readability
    # This produces directories with 200 - 1000 files per dir.
    # The directory is the leading two digits of the scaffold number,
    # (when represented as a zero-filled 5-digit number)
    cd scaffolds
    foreach d1 (0 1 2 3)
        foreach d2 (0 1 2 3 4 5 6 7 8 9)
            foreach file ($d1$d2*.fa)
                set num = `sed -n '1s/>scaffold_//p' $file`
                set dir = `printf "%05d" $num | sed 's/...$//'`
                mkdir -p $dir
                echo "moving $file to $dir/scaffold_$num"
                mv $file $dir/scaffold_${num}.fa
            end
        end
    end

    # remake a whole-genome fasta file, with the scaffold records
    # ordered more sensibly (numeric, not alphabetic)
    set ordered_file = fugu_v3.ordered.fa
    cp /dev/null $ordered_file
    cd scaffolds
    foreach d ([0-9][0-9])
        set filelist = `(cd $d; ls *.fa | sort -n -k 1.10)`
        foreach f ($filelist) 
            echo "concatenating $f"
            cat $d/$f >> ../$ordered_file
        end
    end
    cd ..

    # split into ~500KB "super-scaffolds"
    # each file has one or more scaffolds. Fasta records are preserved
    # This creates 577 files, size range ~51000 to ~1.2Mb
    rm -fr superscaffolds
    mkdir superscaffolds

    faSplit about $ordered_file 500000 superscaffolds/ss_



# RUN REPEAT MASKER ON THE SUPER-SCAFFOLDS (6/2/03 KRR)
    # note: fugu library ("puffer.lib") is dated 7/9/2002
    ssh eieio
    cd ~/fr1

    # make the run directory, output directory, and job list
    mkdir -p RMRun/Un
    cp /dev/null RMRun/RMJobs
    set fugudir = /cluster/store5/Fugu_Rubripes_V3
    cd superscaffolds
    foreach f (ss_*.fa)
        echo /cluster/bin/scripts/RMFugu \
                $fugudir/superscaffolds $f $fugudir/RMRun/Un \
               '{'check out line+ $fugudir/RMRun/Un/$f.out'}' \
              >> ../RMRun/RMJobs
    end

    # do the run
    ssh kk
    cd ~/fr1/RMRun
    para create RMJobs
    para try
    para check
    para push
    para check,...


# PROCESS SUPER-SCAFFOLDS FOR SIMPLE REPEATS (6/2/03 KRR)
    # TRF runs pretty quickly now... it takes a few hours total runtime, 
    # so instead of binrsyncing and para-running, just do this on the
    # local fileserver
    ssh eieio
    mkdir ~/fr1/bed/simpleRepeat
    cd ~/fr1/bed/simpleRepeat
    mkdir trf
    cp /dev/null jobs.csh
    set fugudir = /cluster/store5/Fugu_Rubripes_V3
    foreach f ($fugudir/superscaffolds/*.fa)
        set fout = $f:t:r.bed
        echo $fout
        echo "/cluster/bin/i386/trfBig -trf=/cluster/bin/i386/trf $f /dev/null -bedAt=trf/$fout -tempDir=/tmp" \
        >> jobs.csh
    end
    tcsh jobs.csh >&! jobs.log &
    # check on this with
    tail -f jobs.log
    wc -l jobs.csh
    ls -1 trf | wc -l


# FILTER SIMPLE REPEATS INTO MASK (6/3/03 KRR)
    # make a filtered version # of the trf output: 
    # keep trf's with period <= 12:
    ssh eieio
    cd ~/fr1/bed/simpleRepeat
    mkdir -p trfMask
    foreach f (trf/*.bed)
        echo "filtering $f"
        awk '{if ($5 <= 12) print;}' $f > trfMask/$f:t
    end


# MASK SUPER-SCAFFOLDS USING REPEATMASKER AND FILTERED TRF FILES (6/3/03 KRR)
    ssh eieio
    cd ~/fr1
    mkdir superscaffolds.masked
    foreach p (superscaffolds/ss_*.fa)
        set f = $p:t
        echo "masking $f"
        maskOutFa $p RMRun/Un/$f.out superscaffolds.masked/$f -soft
        maskOutFa superscaffolds.masked/$f bed/simpleRepeat/trfMask/${f:r}.bed superscaffolds.masked/$f -softAdd
    end


# CREATE FASTA FOR UNORDERED CHROM FROM MASKED SUPER-SCAFFOLDS (6/3/03 KRR)
# Note: scaffolds are separated by 1000 Bp gaps
    ssh eieio
    cd ~/fr1

    # remake a whole-genome fasta file from the masked supser-scaffolds
    set masked_file = fugu_v3.masked.fa
    cd superscaffolds.masked
    cat `ls ss_*.fa | sort -n -k 1.4` > ../$masked_file
    cd ..
    
    # generate AGP and lift files from the chromosome fasta 

    scaffoldFaToAgp $masked_file
    # gap size is 1000, total gaps: 20379
    # chrom size is 349519338

    mkdir Un
    mv fugu_v3.masked.agp Un/chrUn.agp
    mv fugu_v3.masked.lft Un/chrUn.lft

    # reran scaffoldFaToAgp with new version to generate a gap
    # file that contains gaps within contig
    mv fugu_v3.masked.gap Un/chrUn.gap

    # Create chromosome FA file from AGP and file of masked scaffolds
    cd Un
    agpToFa -simpleMultiMixed chrUn.agp chrUn chrUn.fa ../fugu_v3.masked.fa


# CREATING DATABASE (5/2/03 KRR)
    # Create the database.
    ssh hgwdev
    echo 'create database fr1' | hgsql ''
    # Make a semi-permanent read-only alias:
    alias fr1 "mysql -u hguser -phguserstuff -A fr1"
    # Make sure there is at least 5 gig free for the database
    df -h /var/lib/mysql


# STORE SEQUENCE AND ASSEMBLY INFORMATION (6/3/03 KRR)

    # Translate to nib
    ssh eieio
    cd ~/fr1
    mkdir nib
    faToNib -softMask Un/chrUn.fa nib/chrUn.nib

    # Make symbolic links from /gbdb/fr1/nib to the real nibs.
    ssh hgwdev
    mkdir -p /gbdb/fr1/nib
    set fugudir = /cluster/store5/Fugu_Rubripes_V3
    ln -s $fugudir/nib/chrUn.nib  /gbdb/fr1/nib

    # Load /gbdb/fr1/nib paths into database and save size info.
     ssh hgwdev
     hgsql fr1  < ~/src/hg/lib/chromInfo.sql
     cd ~/fr1

    # NOTE: last arg here may be in error
     hgNibSeq -preMadeNib fr1 /gbdb/fr1/nib chrUn.nib
     echo "select chrom,size from chromInfo" | hgsql -N fr1 > chrom.sizes

    # create assembly and gap tracks
    ssh hgwdev
    hgGoldGapGl -noGl fr1 ~ fr1 

    # reload gap track so that it also includes gaps for N's in scaffolds
    hgLoadGap fr1 /cluster/data/fr1
    
    # make Map Contigs track for comparison to Assembly track
    # cd /cluster/data/fr1/Un
    # mkdir -p lift
    # grep scaffold chrUn.lft > lift/ordered.lft
    # hgCtgPos fr1 /cluster/data/fr1


# CREATING GRP TABLE FOR TRACK GROUPING (5/2/03 KRR)
    # Copy all the data from the table "grp" 
    # in the existing database "rn1" to the new database
    ssh hgwdev
    echo "create table grp (PRIMARY KEY(NAME)) select * from rn1.grp" \
      | hgsql fr1


# CREATE REPEAT TRACKS (6/3/03 KRR)

    ssh eieio
    cd ~/fr1

    # merge and lift up the repeatmasker output files to chrom coordinates
    liftUp Un/chrUn.fa.out Un/chrUn.lft warn RMRun/Un/*.out

    # load into the database (chrUn_rmsk table)
    ssh hgwdev
    hgLoadOut fr1 Un/chrUn.fa.out

    # lift the simple repeat output to chrom coordinates
    ssh eieio
    cd ~/fr1
    cd bed/simpleRepeat
    liftUp simpleRepeat.bed ~/fr1/Un/chrUn.lft warn trf/*.bed

    # load into the database (simpleRepeat table)
    ssh hgwdev
    cd ~/fr1/bed/simpleRepeat
    hgLoadBed fr1 simpleRepeat simpleRepeat.bed \
      -sqlTable=$HOME/src/hg/lib/simpleRepeat.sql



# MAKE GCPERCENT (6/3/03 KRR)
     ssh hgwdev
    set fugudir = /cluster/store5/Fugu_Rubripes_V3
     mkdir -p $fugudir/bed/gcPercent
     cd $fugudir/bed/gcPercent
     hgsql fr1  < ~/src/hg/lib/gcPercent.sql

     # load gcPercent table
     hgGcPercent fr1 ../../nib


# MAKE HGCENTRALTEST ENTRY AND TRACKDB TABLE FOR FUGU (5/6/03 KRR)
    echo 'INSERT INTO defaultDb VALUES ("Fugu", "fr1");' \
      | hgsql -h genome-testdb hgcentraltest

    # Warning: must genome and organism fields must correspond
    # with defaultDb values
    # Note: for next assembly, set scientificName column to "Fugu rubripes"
    echo 'INSERT INTO dbDb \
        (name, description, nibPath, organism, \
                defaultPos, active, orderKey, genome) values \
        ("fr1", "Aug. 2002", "/gbdb/fr1/nib", "Fugu", \
               "chrUn:827700-845800", 1, 10, "Fugu");' \
      | hgsql -h genome-testdb hgcentraltest

    # Make trackDb table so browser knows what tracks to expect:
    ssh hgwdev
    cd ~/src/hg/makeDb/trackDb
    cvs up -d -P

    # Edit that makefile to add fr1 in all the right places and do
    make update

    # go public on genome-test
    #make alpha
    cvs commit makefile

    # Add trackDb directories
    mkdir fugu
    mkdir fugu/fr1
    cvs add fugu
    cvs add fugu/fr1
    cvs commit fugu


# MAKE RELATIONAL RNA TABLES

    hgLoadRna new fr1


# MAKE HGCENTRALTEST BLATSERVERS ENTRY FOR FUGU (6/4/03 KRR)
    ssh hgwdev
    # Get appropriate hostname from cluster admins
    echo 'insert into blatServers values("fr1", "blat11", "17783", "1"); \
          insert into blatServers values("fr1", "blat11", "17782", "0");' \
      | hgsql -h genome-testdb hgcentraltest


# MAKE AND STORE mRNA ALIGNMENTS  (6/8/03 KRR)

    # Load up the local disks of the small cluster with  mrna.fa
    # and masked super-scaffolds
    # from /cluster/store5/mrna.134/org/Takifugu_rubripes
    # note: there are 76 mrna sequences, and 24398 EST's
    ssh kkr1u00
    cd /iscratch/i/fugu
    set mrnaDir = /cluster/store5/mrna.134/org/Takifugu_rubripes
    mkdir mrna 
    cp -p $mrnaDir/mrna.fa mrna
    mkdir trfFa 
    cp -p ~/fr1/superscaffolds.masked/*.fa trfFa
    # note: this isn't currently working for me 
    # (permission errors) -- Hiram did it
    ~kent/bin/iSync

    # generate alignment job 
    # TODO: try aligning whole mrna file against whole chrUn nib
    # on file server (eieio), instead of using cluster.
    # this would be simpler, and probably fast enough
    cd ~/fr1/bed
    mkdir mrna
    cd mrna
    mkdir psl
    ls -1S /iscratch/i/fugu/trfFa/*.fa > genome.lst
    ls -1S /iscratch/i/fugu/mrna/*.fa > mrna.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -ooc={check in exists /scratch/hg/h/11.ooc} {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP 
'_EOF_' 
    # << happy emacs
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    para try
    para check
    para push
    # this just takes a few minutes

    ssh eieio
    cd ~/fr1/bed/mrna
    pslSort dirs raw.psl /tmp psl
        # psl with 578 files
        # 578 files in 1 dirs
        # Got 578 files 24 files per mid file
        # Writing /tmp/tmp0.psl
        # ...
        # Writing /tmp/tmp24.psl
        # writing raw.psl
        # Cleaning up temp files

    pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl scaffolds.psl \
      /dev/null
        # Processing raw.psl to scaffolds.psl and /dev/null
        # Processed 161 alignments

    # lift up PSL files to chrom coordinates
    liftUp -nohead all_mrna.psl ~/fr1/Un/chrUn.lft warn scaffolds.psl
        # Got 40758 lifts in /cluster/home/kate/fr1/Un/chrUn.lft
        # Lifting scaffolds.psl

    pslSortAcc nohead chrom /tmp all_mrna.psl
        # Processing all_mrna.psl
        # Processed 89 lines into 1 temp files

    # load mrna tables
    ssh hgwdev
    cd ~/fr1/bed/mrna/chrom

    # rename psl file to required format (must be chr*_mrna)
    mv chrUn.psl chrUn_mrna.psl

    # load alignments
    hgLoadPsl -noTNameIx fr1 chrUn_mrna.psl
    cd ..
    hgLoadPsl fr1 all_mrna.psl -nobin

    # prepare for data load
    set mrnaDir = mrna.134
    mkdir /gbdb/fr1/mrna.134
    #ln -s /cluster/store5/$mrnaDir/org/Takifugu_rubripes /gbdb/fr1/$mrnaDir
    ln -s /cluster/store5/$mrnaDir/org/Takifugu_rubripes/mrna.fa /gbdb/fr1/$mrnaDir
    # load mRna into database
    # WARNING: had to use -ignore flag... check on this
    hgLoadRna add -type=mRNA fr1 /gbdb/fr1/$mrnaDir/mrna.fa \
        /cluster/store5/$mrnaDir/org/Takifugu_rubripes/mrna.ra
        # Adding data of type: mRNA ...


# MAKE AND STORE EST ALIGNMENTS  (6/8/03 KRR)

    # setup BlueArc with EST files
    ssh kk
    mkdir -p /cluster/bluearc/fugu/fr1/est
    cd /cluster/bluearc/fugu/fr1/est
    
    # split up est file into 100Kb chunks 
    # (makes ~120 query files * ~600 target superscaffolds = 70000 jobs)
    faSplit about  /cluster/store5/mrna.134/org/Takifugu_rubripes/est.fa \
                                100000 e

    # TODO: try aligning est files against whole chrUn nib
    # this would be easier on the filesystem, and probably fast enough
    cd ~/fr1/bed
    mkdir est
    cd est
    mkdir psl
    ls -1S /iscratch/i/fugu/trfFa/*.fa > genome.lst
    ls -1S /cluster/bluearc/fugu/fr1/est/*.fa > est.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -ooc={check in exists /scratch/hg/h/11.ooc} {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP 
'_EOF_' 
    # << happy emacs
    gensub2 genome.lst est.lst gsub spec
    para create spec
    para try
    para check
    para push
    para check...
    # just a few minutes to complete

    # process alignments
    ssh eieio
    cd ~/fr1/bed/est

    pslSort dirs raw.psl /tmp psl
    pslReps -minAli=0.98 -sizeMatters -nearTop=0.005 raw.psl scaffolds.psl \
      /dev/null

    # lift up PSL files to chrom coordinates
    liftUp -nohead all_est.psl ~/fr1/Un/chrUn.lft warn scaffolds.psl
        # Got 40758 lifts in /cluster/home/kate/fr1/Un/chrUn.lft
        # Lifting scaffolds.psl

    pslSortAcc nohead chrom /tmp all_est.psl
        # Processing all_est.psl
        # Processed 20501 lines into 1 temp files

    # load into database
    ssh hgwdev
    cd ~/fr1/bed/est/chrom

    # rename psl file to required format (must be chr*_est)
    mv chrUn.psl chrUn_est.psl

    # load alignments
    hgLoadPsl -noTNameIx fr1 chrUn_est.psl
    cd ..
    hgLoadPsl fr1 all_est.psl -nobin

    set mrnaDir = mrna.134
    ln -s /cluster/store5/$mrnaDir/org/Takifugu_rubripes/est.fa /gbdb/fr1/$mrnaDir
    
    cd ~/fr1/bed/est
    rm *.tab
    hgLoadRna add -type=EST fr1 /gbdb/fr1/$mrnaDir/est.fa \
        /cluster/store5/$mrnaDir/org/Takifugu_rubripes/est.ra
    unset mrnaDir

    # note: there is no Genbank RefSeq directory for this organism

# PRODUCE CROSS_SPECIES MRNA ALIGNMENT (6/9/03 KRR)
    # Aligns non-Fugu mRNA's  against the masked genome
    # This uses Genbank mRNA files already downloaded 
    # to /cluster/store5/genbank.134
    # Files are unpacked to /cluster/store5/mrna.134

    # use fileserver where genbank.134 resides
    ssh eieio
    set genbankdir = /cluster/store5/genbank.134
    set mrnadir = /cluster/store5/mrna.134
    cd $mrnaDir

    # generate filter file for unpacking entries
    # note: skip this if fuguXenoRna.fil exists
    cat << '_EOF_' > fuguXenoRna.fil
restrict mol=mRNA & !(org="Takifugu rubripes")
hide mol
'_EOF_' 
    # << happy emacs

    # unpack non-Fugu mRNAs
    gunzip -c $genbankdir/gb{pri,rod,v,mam,inv,bct,htc,pat,phg,pln}* \
        | gbToFaRa fuguXenoRna.fil \
                fuguXenoRna.fa fuguXenoRna.ra fuguXenoRna.ta stdin

    # copy masked chromosome nib to bluearc for cluster run
    # NOTE: probably want to do this earlier
    # so it is available for all alignments
    ssh kk
    cd /cluster/bluearc/fugu/fr1
    mkdir chromNib
    cp ~/fr1/nib/* chromNib

    # split masked scaffold file into 50Mb chunks
    # to produce 7 files
    mkdir trfFa
    faSplit about ~/fr1/fugu_v3.masked.fa 50000000 trfFa/s

    # split big xeno mrna file into 700Kb chunks on bluearc for cluster run
    # (makes ~1000 query files * 7 target files = ~7000 jobs)
    # This is not too many to handle in a single directory (psl)
    mkdir xenoRnaSplit
    faSplit about $mrnadir/fuguXenoRna.fa 700000 xenoRnaSplit/m

    cd ~/fr1/bed
    mkdir xenoMrna
    cd xenoMrna
    mkdir psl
    ls -1S /cluster/bluearc/fugu/fr1/trfFa/s*.fa > genome.lst
    ls -1S /cluster/bluearc/fugu/fr1/xenoRnaSplit/m*.fa > mrna.lst
    cat << '_EOF_' > gsub
#LOOP
/cluster/bin/i386/blat -q=rnax -t=dnax -mask=lower {check in exists+ $(path1)} {check in line+ $(path2)} {check out line+ psl/$(root1)_$(root2).psl}
#ENDLOOP 
'_EOF_'
    # << happy emacs
    gensub2 genome.lst mrna.lst gsub spec
    para create spec
    para try
    para check
    para push
    para check...
        # Completed: 6629 of 6629 jobs
        # Avg. job: 2.7 min
        # Longest job: 4.7 min
        # Submission to last job: 49 min

    # process alignments
    ssh eieio
    cd ~/fr1/bed/xenoMrna

    # sort alignmnents
    pslSort dirs raw.psl /tmp psl
        # Got 6629 files 81 files per mid file

    # analyse repeats and generate genome-wide best alignment
    # note different parameters from previous alignments
    pslReps -minAli=0.25 raw.psl scaffolds.psl /dev/null
        # ...........Processed 1675350 alignments

    # lift up PSL files to chrom coordinates
    liftUp -nohead chrom.psl ~/fr1/Un/chrUn.lft warn scaffolds.psl
        # Got 40758 lifts in /cluster/home/kate/fr1/Un/chrUn.lft

    # extract psl file for each accession, into a dir
    pslSortAcc nohead chrom /tmp chrom.psl
    
    # merge psl files
    pslCat -dir chrom > xenoMrna.psl

    # load alignments into database
    ssh hgwdev
    cd ~/fr1/bed/xenoMrna
    rm *.tab
    hgLoadPsl fr1 xenoMrna.psl

    # load xeno rna into database
    set mrnadir = mrna.134
    set datadir = /cluster/store5
    ln -s $datadir/$mrnadir/fuguXenoRna.fa /gbdb/fr1/$mrnaDir
    hgLoadRna add -type=xenoRna fr1 /gbdb/fr1/$mrnadir/fuguXenoRna.fa \
         $datadir/$mrnadir/fuguXenoRna.ra


# HUMAN HG16 BLAT ALIGNMENT (DONE 2003-09-09 kate)
    ssh kk
    mkdir /cluster/data/fr1/bed/blatHg16
    cd /cluster/data/fr1/bed/blatHg16
    # create output dir and subdirs (prevent directory overload)
    mkdir psl
    foreach f (/iscratch/i/fugu/trfFa/*.fa)
      set c=$f:t:r
      echo $c
      mkdir -p psl/$c
    end
    mkdir run
    cd run
    ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
        # 578 files
    ls -1S /scratch/hg/gs.17/build34/trfFa/*.fa > human.lst
        # 491 files
    cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/fr1/bed/blatHg16/psl/$(root1)/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'

    gensub2 fugu.lst human.lst gsub spec
        # 283798 lines
    para create spec
    para try
    para check
    para push
    para check

    # merge to single Fugu chrUn chrom
    ssh eieio
    cd /cluster/data/fr1/bed/blastHg16
    pslCat -dir psl/ss_* | \
        liftUp -type=.psl stdout \
                /cluster/data/fr1/Un/chrUn.lft warn stdin | \
        pslSortAcc nohead chrom temp stdin
        # 15 minutes ?
    
    # load into database
    cd $fr1dir/chrom
    hgLoadPsl -noTNameIx fr1 chrUn_blatHg15.psl
    rm -fr psl


# TETRAODON BLAT ALIGNMENT (DONE 2003-09-29 kate)
    # Blat using 2002 Tetraodon nigroviridis (V6) from genoscope
    # repeat and trf-masked contigs
    ssh kk
    mkdir /cluster/data/fr1/bed/blatTetra
    cd /cluster/data/fr1/bed/blatTetra
    # create output dir and subdirs (prevent directory overload)
    mkdir psl
    foreach f (/iscratch/i/fugu/trfFa/*.fa)
      set c=$f:t:r
      echo $c
      mkdir -p psl/$c
    end
    mkdir run
    cd run
    ls -1S /iscratch/i/fugu/trfFa/*.fa > fugu.lst
        # 578 files
    ls -1S /iscratch/i/tetra/*.fa > tetra.lst
        # 632 files
    cat << 'EOF' > gsub
#LOOP
/cluster/bin/i386/blat -mask=lower -qMask=lower -q=dnax -t=dnax {check in line+ $(path1)} {check in line+ $(path2)} {check out line+ /cluster/data/fr1/bed/blatTetra/psl/$(root1)/$(root1)_$(root2).psl}
#ENDLOOP
'EOF'

    gensub2 fugu.lst tetra.lst gsub spec
    wc -l spec
        #  365296 spec
    para create spec
    para try
    para check
    para push
    para check

    # merge to single Fugu chrUn chrom
    ssh eieio
    cd /cluster/data/fr1/bed/blatTetra
    pslCat -dir psl/ss_* | \
        liftUp -type=.psl stdout \
                /cluster/data/fr1/Un/chrUn.lft warn stdin | \
        pslSortAcc nohead chrom temp stdin
        # 15 minutes ?
    rm -fr psl
    
    # load into database
    ssh hgwdev
    cd /cluster/data/fr1/bed/blatTetra/chrom
    mv chrUn.psl chrUn_blatTetra.psl
    hgLoadPsl -noTNameIx fr1 chrUn_blatTetra.psl

    # Add entry to trackDb.ra

    # make symlinks in /gbdb and load tetra sequence data
    cat * > ../tetra.masked.fa
    cd /gbdb/fr1
    ln -s /cluster/data/tetra/tetra.masked.fa 

    # load tetra sequences into database
    ssh hgwdev
    cd /cluster/data/fr1/bed/blatTetra
    hgLoadSeq fr1 /gbdb/fr1/tetra.masked.fa
       # Warning: load of seq did not go as planned: 108177 record(s), 1147 row(s) skipped, 13534 warning(s) loading .


# HUMAN HG16 BLAT SWAPPED FROM HUMAN BROWSER

    ssh eieio
    set hg16dir = /cluster/data/hg16/bed/blatFr1
    set fr1dir = /cluster/data/fr1/bed/blatHg16.swp
    mkdir -p $fr1dir/{psl,chrom}

    # swap alignments (for each human chrom)
    cd $hg16dir/chrom
    foreach f (chr?{,?}{,_random}_blatFr1.psl)
        pslSwap $f $fr1dir/psl/$f
    end 
    # merge to single Fugu chrUn chrom
    cd $fr1dir/psl
    pslCat chr?{,?}{,_random}_blatFr1.psl > \
                $fr1dir/chrom/scaffolds.psl 

    # lift target side (fugu) to chrom coordinates
    cd $fr1dir/chrom
    liftUp chrUn_blatHg16.psl /cluster/data/fr1/Un/chrUn.lft \
                                        warn scaffolds.psl
        # Got 40758 lifts
    
    # load into database
    ssh hgwdev
    cd /cluster/data/fr1/bed/blatHg16.swp/chrom
    hgLoadPsl -noTNameIx fr1 chrUn_blatHg16.psl

    # Add entry to trackDb.ra

    # make hg16 symlink in /gbdb and load human sequence data
    set gbdbdir=/gbdb/fr1/hg16
    mkdir $gbdbdir
    cd $gbdbdir
    foreach f (/cluster/data/hg16/?{,?})
        set c = $f:t
        if (-e $f/chr${c}.fa) then
            ln -s $f/chr${c}.fa
        endif
        if (-e $f/chr${c}_random.fa) then
            ln -s $f/chr${c}_random.fa
        endif
    end

    # load human sequences into database
    ssh hgwdev
    cd /cluster/data/fr1/bed/blatHg16.swp
    hgLoadSeq fr1 /gbdb/fr1/hg16/*.fa



# PRODUCE TETRAODON BLASTZ (6/13/03 IN PROGRESS KRR)
# NOTE: the tetra sequence wasn't repeatmasked -> must be redone

    cd eieio
    mkdir ~/fr1/bed/blastzTetra

    # translate the Genoscope fasta files to upper-case
    # so sequence doesn't look like it's masked out (all-lower)
    cd /iscratch/i/fish
    foreach f (*.fa)
        echo $f
        tr '[a-z]' '[A-Z]' < $f > /cluster/bluearc/tetra/$f
    end
    (cat README; echo "NOTE: Upper-cased files from /iscratch/i/fish") > \
                                        /cluster/bluearc/tetra/README

    # copy fugu files to cluster filesystem
    set clusterdir = /cluster/bluearc
    mkdir -p $clusterdir/fugu/fr1/rmsk
    cd $clusterdir/fugu/fr1/rmsk
    rm *.out
    cp ~/fr1/Un/chrUn.fa.out .

    # generate DEF file
    cd ~/fr1/bed/blastzTetra
    cat << '_EOF_' > DEF
# Tetraodon vs. Fugu
# TODO: angie's home dir out of this
export PATH=/usr/bin:/bin:/usr/local/bin:/cluster/home/angie/schwartzbin:/cluster/bin/i386

ALIGN=blastz-run
BLASTZ=blastz
BLASTZ_H=2000
#BLASTZ_ABRIDGE_REPEATS=1 if SMSK is specified
BLASTZ_ABRIDGE_REPEATS=0

# TARGET - Fugu
# soft-masked chrom nib
SEQ1_DIR=/cluster/bluearc/fugu/fr1/chromNib
# repeat masker output file for chrom
SEQ1_RMSK=/cluster/bluearc/fugu/fr1/rmsk
# lineage-specific repeats -- (repeats in fugu, not in tetraodon)
# we don't have that information for these species
SEQ1_SMSK=/dev/null
# currently unused
SEQ1_FLAG=-fish
SEQ1_IN_CONTIGS=0
# 10 MB chunk for target
SEQ1_CHUNK=10000000
SEQ1_LAP=10000

# QUERY - Tetraodon
SEQ2_DIR=/cluster/bluearc/tetra
# no repeatmasker output for these sequences
SEQ2_RMSK=/dev/null
SEQ2_SMSK=/dev/null
# currently unused
SEQ2_FLAG=-fish
# set to 1 if multiple contigs are in 1 fasta file
SEQ2_IN_CONTIGS=1
# 10 Mbase for query
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/cluster/store5/Fugu_Rubripes_V3/bed/blastzTetra

DEF=$BASE/DEF
RAW=$BASE/raw
CDBDIR=$BASE
SEQ1_LEN=$BASE/S1.len
SEQ2_LEN=$BASE/S2.len

#DEBUG=1
'_EOF_' 
    # << happy emacs

    # save the DEF file in the current standard place
    chmod +x DEF
    cp DEF ~angie/hummus/DEF.fr1-tetra.2003-06-17

    # setup cluster run
    ssh kk
    cd ~/fr1/bed/blastzTetra

    # source the DEF file
    bash
    . ./DEF

    # follow the next set of directions slavishly
    mkdir -p $BASE/run
    # give up on avoiding angie's directories
    # tcl script
    # creates xdir.sh and joblist run/j
    ~angie/hummus/make-joblist $DEF > $BASE/run/j
        # Computing genome sizes
        # Writing mkdir script: /cluster/store5/Fugu_Rubripes_V3/bed/blastzTetra/xdir.sh
        # Writing job list

    # xdir.sh makes a bunch of result directories in $BASE/raw/
    # based on chrom name and CHUNK size
    sh $BASE/xdir.sh
    cd $BASE/run

    # now edit j to prefix path to executable name
    # NOTE: we should have a controlled version of schwartz bin executables
    sed -e 's#^#/cluster/home/schwartz/bin/#' j > j2
    wc -l j*
    head j2

    # *** make sure the j2 edits are OK, then use it:
    mv j2 j

    # para create will create the file: 'batch' for the cluster run
    para create j
        # Checking input files
        # 1400 jobs written to batch
    para try
    para check
    para push
    # ... etc ...
    # about 3 hrs. per job

    # post-process blastz
    # normalize
    ssh kk
    cd ~/fr1/bed/blastzTetra
        #   source the DEF file again in case you are coming back to this
    bash
    . ./DEF

    # a new run directory
    mkdir -p $BASE/run.1

    # another obscure script creates a new job list:
    ~angie/hummus/do.out2lav $DEF >$BASE/run.1/j
    cd $BASE/run.1

    # add path to executable in the job list:
    sed -e 's/^/\/cluster\/home\/schwartz\/bin\//' j > j2
    wc -l j*
    head j2

    # make sure the edited j2 is OK, then use it:
    mv j2 j
    para create j
    para try
    para check
    para push
    # etc.
        # 15 minute jobs

    # Translate the .lav files
    # into axt files
    ssh kkstore
    set base="/cluster/store5/Fugu_Rubripes_V3/bed/blastzTetra"
    set tbl="blastzTetra"

    # generate single tetra fa file required for lavToAxt
    cd /cluster/bluearc
    cat tetra/* > tetra-all.fa

    # copy lav files to bluearc to speed up job
    ssh eieio
    mkdir -p /cluster/bluearc/fugu/fr1/blastzTetra/lav
    cp ~/fr1/bed/blastzTetra/lav/chrUn/* /cluster/bluearc/fugu/fr1/blastzTetra/lav
    
    # cluster job for converting lav files
    # these take a long time because query is in fa files
    # (with 880K sequences), not a chrom nib
    ssh kk
    cd ~/fr1/bed/blastzTetra
    mkdir -p axtLav
    mkdir run.lavToAxt
    cd run.lavToAxt
    
    # create file lists
    ls -1S /cluster/bluearc/fugu/fr1/blastzTetra/lav/*.lav > lav.lst
    echo "" > dummy.lst

    # Create template file, gsub, for gensub2. 
    cat << 'EOF' > gsub
#LOOP 
/cluster/bin/i386/lavToAxt {check in line+ $(path1)} /cluster/bluearc/fugu/fr1/chromNib -fa /cluster/bluearc/tetra-all.fa {check out line+ /cluster/store5/Fugu_Rubripes_V3/bed/blastzTetra/axtLav/$(root1).axt}
#ENDLOOP
'EOF'

    gensub2 lav.lst dummy.lst gsub jobList
    para create jobList
    para try 
    para check
    para push

    # 35 hours (too long -- need to split up better).

    cd $base
    mkdir -p axtChrom
    cat axtLav/*.axt | /cluster/bin/i386/axtSort stdin axtChrom/chrUn.axt
    
    #foreach c (lav/*)
      #pushd $c
      #set chr=$c:t
      #set out=$base/axtChrom/$chr.axt
      #echo "out=$out"
      #echo "Translating $chr lav to $out"
      #echo cat `ls -1 *.lav | sort -g` \
        #| /cluster/bin/i386/lavToAxt stdin $fugudir/nib \
                #-fa /cluster/bluearc/tetra-all.fa stdout \
        #| /cluster/bin/i386/axtSort stdin $out
      #popd
    #end

    # Translate the sorted axt files into psl:
    cd $base
    mkdir -p pslChrom
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # load these blastz results
    ssh hgwdev
    cd ~/fr1/bed/blastzTetra/pslChrom
    /cluster/bin/i386/hgLoadPsl -noTNameIx fr1 chr*_*.psl

    # STOPPED HERE -- due to empty output data


# PRODUCING GENSCAN PREDICTIONS (6/17/03 KRR)
#       Note: Genscan input files should be hard-masked
    
    ssh eieio

    # hard mask the super-scaffolds
    mkdir -p ~/fr1/superscaffolds.hardmasked
    cd ~/fr1/superscaffolds
    foreach f (ss_*.fa)
        echo $
        maskOutFa $f hard ../superscaffolds.hardmasked/$f
    end

    # setup the genscan directory
    mkdir -p ~/fr1/bed/genscan
    cd ~/fr1/bed/genscan

    # Make 3 subdirectories for genscan to put output files in
    mkdir -p gtf pep subopt

    # Generate a list file, genome.list, of all the contigs
    # *that do not have pure Ns* (due to heterochromatin, unsequenceable 
    # stuff) which would cause genscan to run forever.
    rm -f genome.list
    touch genome.list
    foreach f ( `ls -1S /cluster/store5/Fugu_Rubripes_V3/superscaffolds.hardmasked/ss_*.fa` )
      egrep '[ACGT]' $f > /dev/null
      if ($status == 0) echo $f >> genome.list
    end

    # Log into kkr1u00 (not kk!).  kkr1u00 is the driver node for the small
    # cluster (kkr2u00 -kkr8u00. Genscan has problem running on the
    # big cluster, due to limitation of memory and swap space on each
    # processing node).
    ssh kkr1u00
    cd ~/fr1/bed/genscan

    # Create template file, gsub, for gensub2. 
    # NOTE: According to the README for Genscan, 
    # the HumanIso.smat parameter files is used for all vertebrates
    cat << '_EOF_' > gsub
#LOOP 
/cluster/home/kent/bin/i386/gsBig {check in line+ $(path1)} {check out line gtf/$(root1).gtf} -trans={check out line pep/$(root1).pep} -subopt={check out line subopt/$(root1).bed} -exe=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/genscan -par=/cluster/home/fanhsu/projects/compbio/bin/genscan-linux/HumanIso.smat -tmp=/tmp -window=2400000
#ENDLOOP
'_EOF_' 
    # << happy emacs
    echo "" > dummy.list
    gensub2 genome.list dummy.list gsub jobList
    para create jobList
    para try
    para check
    para push

    # Convert output to chrom coordinates
    ssh eieio
    cd ~/fr1/bed/genscan
    liftUp genscan.gtf ~/fr1/Un/chrUn.lft warn gtf/*.gtf
    liftUp genscanSubopt.bed ~/fr1/Un/chrUn.lft warn subopt/*.bed > /dev/null
    cat pep/*.pep > genscan.pep

    # Load into the database
    ssh hgwdev
    cd ~/fr1/bed/genscan
    ldHgGene fr1 genscan genscan.gtf
        # Reading genscan.gtf
        # Read 36106 transcripts in 243984 lines in 1 files
        # 36106 groups 1 seqs 1 sources 1 feature types
        # 36106 gene predictions
    hgPepPred fr1 generic genscanPep genscan.pep
    hgLoadBed fr1 genscanSubopt genscanSubopt.bed > /dev/null


# CPGISLANDS (6/17/03 KRR)
    ssh eieio
    mkdir -p /cluster/data/fr1/bed/cpgIsland
    cd /cluster/data/fr1/bed/cpgIsland

    # Build software emailed from Asif Chinwalla (achinwal@watson.wustl.edu)
    # NOTE: we should keep this centrally
    cp -rp /cluster/data/mm3/bed/cpgIsland/cpg_dist .
    cd cpg_dist
    # gcc readseq.c cpg_lh.c -o cpglh.exe

    # hard-mask the chrom fa 
    # cpglh.exe requires hard-masked (N) .fa's.  
    # There may be warnings about "bad character" for IUPAC ambiguous 
    # characters like R, S, etc.  Ignore the warnings.  
    cd ~/fr1/Un
    maskOutFa chrUn.fa hard chrUn.hardmasked.fa

    # run cpgislands on the hard-masked chrom fa
    cd ~/fr1/bed/cpgIsland
    ./cpglh.exe ~/fr1/Un/chrUn.hardmasked.fa > chrUn.cpg

    cp ~kent/mm3/bed/cpgIsland/filter.awk .
    awk -f filter.awk chr*.cpg > cpgIsland.bed

    # load annotations into database
    ssh hgwdev
    cd ~/fr1/bed/cpgIsland
    # NOTE: this should go into a shared lib dir
    cp $HOME/kent/src/hg/lib/cpgIsland.sql .
    hgLoadBed fr1 cpgIsland -tab -noBin \
      -sqlTable=cpgIsland.sql cpgIsland.bed
        # Reading cpgIsland.bed
        # Loaded 41723 elements
        # Sorted
        # Saving bed.tab
        # Loading fr1

# LOAD ENSEMBL GENE PREDICTIONS (DONE 2003-08-11 kate)
# The downloads were provided by Shawn Hoon (shawnh@fugu-sg.org)
# of the Singapore Institute of Molecular and Cell Biology (IMCB)
# Fugu genome project, part of the international Fugu Genome Consortium
# Other contact is: Venkatesh Byrappa (mcbbv@imcb.a-star.edu.sg)
# In future releases, these annotations will need to be downloaded from Ensembl,
# using Ensmart

    ssh hgwdev
    cd /cluster/data/fr1
    mkdir -p bed/ensembl
    cd bed/ensembl

    # download and load gene predictions
    wget http://www.fugu-sg.org/~shawnh/fugu_download/fugu_rubripes_15_2_gtf.tar.gz
    tar xvfz fugu_rubripes_15_2_gtf.tar.gz
    cd fugu_rubripes_core_15_2
    foreach f (*.gtf)
        echo $f
        sed 's/^Chr_scaffold/scaffold/' $f >> ../ensGene.scaffold.gtf
    end
    cd ..
    /cluster/bin/i386/liftUp ensGene.gtf /cluster/data/fr1/Un/chrUn.lft warn ensGene.scaffold.gtf
    /cluster/bin/i386/ldHgGene fr1 ensGene ensGene.gtf
      # 38510 gene predictions

    # download and load predicted peptides
    #wget http://www.fugu-sg.org/~shawnh/fugu_download/Fugu_rubripes.FUGU2.pep.fa.gz
    #gunzip Fugu_rubripes.FUGU2.pep.fa.gz

    sed 's/Translation:SINFRUP/SINFRUT/g' < Fugu_rubripes.FUGU2.pep.fa > ensembl.pep
    hgPepPred fr1 generic ensPep ensembl.pep
#

# MAKE DOWNLOADABLE SEQUENCE FILES (IN PROGRESS 2003-08-29 kate)
    ssh eieio
    cd /cluster/data/fr1

    # Build the .zip files
    mkdir jkStuff
    cp /cluster/data/hg15/jkStuff/zipAll.sh jkStuff

    # Edit jkStuff/zipAll.sh to point to the correct paths.

    ./jkStuff/zipAll.sh >&! zipAll.log
    tail -f zipAll.log

    # Look at zipAll.log to make sure all file lists look reasonable.  
    # Check zip file integrity:

    mkdir zip
    mv *.zip* zip
    cd zip
    foreach f (*.zip)
      unzip -t $f > $f.test
      tail -1 $f.test
    end
    wc -l *.zip.test
      # 3 chromAgp.zip.test
      # 3 chromFa.zip.test
      # 3 chromFaMasked.zip.test
      # 3 chromOut.zip.test
      # 3 chromTrf.zip.test
      # 580 contigFa.zip.test
      # 580 contigFaMasked.zip.test
      # 580 contigTrf.zip.test
      # 3 liftAll.zip.test
      # 3 mrna.zip.test

    # Copy the .zip files to hgwdev:/usr/local/apache/...  
    ssh hgwdev
    cd /cluster/data/fr1/zip
    set webDir = /usr/local/apache/htdocs/goldenPath/fr1
    mkdir -p $webDir/bigZips
    cp -p *.zip $webDir/bigZips

    # Create rmsk database table dump for early users (Victor Solovyev)
    cd $webDir
    chmod o+w .
    mysqldump --user=SECRET --password=SECRET --all --tab=. \
                                                        fr1 chrUn_rmsk
    # change ownership (was mysql)
    mkdir database
    cp chrUn_rmsk.* database
    rm chrUn_rmsk.*

    # restore dir permission
    chmod o-w .
    unset webDir

    # No RefSeqs for Fugu, so no upstream files.

    # create checksum file
    md5sum *.zip > md5sum.txt

    # Take a look at bigZips/* update the README.txt


# move fr1 to incremental genbank/refseq updates:
    # need to create a lift file to splice chrUn into fragments larger
    # than the scaffolds or there will be far to many jobs
    cd /cluster/data/fr1/Un
    ./makeFragLift chrUn.lft >chrUn-frag.lft

    cd /cluster/data/genbank/
    # edit etc/genbank.conf to add fr1

    # align mrnas, then ests
    nice ./bin/gbAlignStep -initial -type=mrna -clusterRootDir=/cluster/bluearc/genbank -iserver=no fr1 
    nice ./bin/gbAlignStep -initial -type=est -clusterRootDir=/cluster/bluearc/genbank -iserver=no fr1 

# TBA PROJECTION - NON-MAMMAL SPECIES INCLUDED (Fugu & Chicken)  2003-10-20 kate
#       Note: CFTR region in fugu is spread across 4 non-consecutive scaffolds
# These are hg16/rn3/mm3 plus NISC sequences.  See hg16 doc for details.
    ssh hgwdev
    mkdir -p  /cluster/data/fr1/bed/nisc/cftr
    set table = tbaFishBirdCFTR
    mkdir -p /gbdb/fr1/$table
    cd /gbdb/fr1/$table
    foreach i (1 2 3 4)
        ln -s /cluster/data/nisc/targets/cftr/CFTR.non-mammal/fugu$i.out \
                /cluster/data/fr1/bed/nisc/cftr/tbaFishBird.$i.maf
        ln -s /cluster/data/fr1/bed/nisc/cftr/tbaFishBird.$i.maf \
                /gbdb/fr1/${table}/tba.$i.maf
    end
    cd /cluster/data/fr1/bed/nisc/cftr
    /cluster/bin/i386/hgLoadMaf -WARN fr1 $table
        # 60 warnings
        
    
# ECORES FROM GENOSCOPE [in progress, hartera, 2004-03-29]
# download data from  http://www.genoscope.cns.fr/externe/tetraodon/Data3/ecores

# ecotigFH - ecores on Fugu, genome conserved with Human, hg16
# ecotigFM - ecores on Fugu, genome conserved with Mouse, mm3
# ecotigFR - ecores on Fugu, genome conserved with Rat, rn3

    ssh hgwdev
    mkdir /cluster/data/fr1/bed/ecores/
    # add parse_ecotig.pl to this directory

# MOUSE

    mkdir /cluster/data/fr1/bed/ecores/mm3
    cd /cluster/data/fr1/bed/ecores/mm3/

    # download data for ecotigFH to this directory 
    # parse ecotig files to produce a bed format file
    perl ../parse_ecotig.pl < ecotigFM > ecotigFM.bed
                                                                                
    # remove "CONTIG:" and change from upper to lower case for "SCAFFOLD"
    perl -pi.bak -e 's/CONTIG://g; s/SCAFFOLD/scaffold/g;' ecotigFM.bed
                                                                                
    # use liftUp to convert scaffold co-ordinates to chrUn co-ordinates
    liftUp -type=.bed ecotigFMlift.bed /cluster/data/fr1/Un/chrUn.lft warn ecotigFM.bed
    # only converts the chromStart and chromEnd fields, use fixLift.pl to 
    # convert rest of the bed file

    perl ../fixLift.pl < ecotigFMlift.bed > ecotigFMliftFixed.bed                                                                       
    hgLoadBed -tab fr1 ecoresMm3 ecotigFMliftFixed.bed
    # clean up
    rm *.bak

# RAT
                                                                                
    mkdir /cluster/data/fr1/bed/ecores/rn3
    cd /cluster/data/fr1/bed/ecores/rn3/
                                                                                
    # download data for ecotigFH to this directory
    # parse ecotig files to produce a bed format file
    perl ../parse_ecotig.pl < ecotigFR > ecotigFR.bed
                                                                                
    # remove "CONTIG:" and change from upper to lower case for "SCAFFOLD"
    perl -pi.bak -e 's/CONTIG://g; s/SCAFFOLD/scaffold/g;' ecotigFR.bed
                                                                                
    # use liftUp to convert scaffold co-ordinates to chrUn co-ordinates
    liftUp -type=.bed ecotigFRlift.bed /cluster/data/fr1/Un/chrUn.lft warn ecotigFR.bed
    # only converts the chromStart and chromEnd fields, use fixLift.pl to 
    # convert rest of the bed file

    perl ../fixLift.pl < ecotigFRlift.bed > ecotigFRliftFixed.bed                                                                       
    hgLoadBed -tab fr1 ecoresRn3 ecotigFRliftFixed.bed
    
    # clean up
    rm *.bak

# HUMAN
    mkdir /cluster/data/fr1/bed/ecores/hg16
    cd /cluster/data/fr1/bed/ecores/hg16/

    # download data for ecotigFH to this directory
    # parse ecotig files to produce a bed format file
    perl ../parse_ecotig.pl < ecotigFH > ecotigFH.bed
                                                                                
    # remove "CONTIG:" and change from upper to lower case for "SCAFFOLD"
    perl -pi.bak -e 's/CONTIG://g; s/SCAFFOLD/scaffold/g;' ecotigFH.bed
   
    # use liftUp to convert scaffold co-ordinates to chrUn co-ordinates
    liftUp -type=.bed ecotigFHlift.bed /cluster/data/fr1/Un/chrUn.lft warn ecotigFH.bed
    # only converts the chromStart and chromEnd fields, use fixLift.pl to 
    # convert rest of the bed file

    perl ../fixLift.pl < ecotigFHlift.bed > ecotigFHliftFixed.bed                                                                       
    hgLoadBed -tab fr1 ecoresHg16 ecotigFHliftFixed.bed
    # clean up
    rm *.bak


    # add entries in kent/src/hg/makeDb/trackDb/fugu/trackDb.ra
    # add html for details pages to this directory:
    # ecoresMm3.html, ecoresRn3.html and ecoresHg16.html

# SWAP ZEBRAFISH-FUGU BLASTZ TO FUGU-ZEBRAFISH (danRer1) 
# (DONE, 2004-06-16, hartera)
    ssh kolossus
    mkdir /cluster/data/fr1/bed/blastz.danRer1.swap.2004-06-16
    cd /cluster/data/fr1/bed/blastz.danRer1.swap.2004-06-16
    set aliDir = /cluster/data/danRer1/bed/blastz.fr1
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    mkdir unsorted axtChrom
    cat $aliDir/axtChrom/chr*.axt \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSplitByTarget stdin unsorted
    # Sort the shuffled .axt files.
    foreach f (unsorted/*.axt)
      echo sorting $f:t:r
      axtSort $f axtChrom/$f:t
    end
    du -sh $aliDir/axtChrom unsorted axtChrom
# 1.6G    /cluster/data/danRer1/bed/blastz.fr1/axtChrom
# 1.6G    unsorted
# 1.6G    axtChrom
    rm -r unsorted

    # translate sorted axt files into psl
    ssh kolossus
    cd /cluster/data/fr1/bed/blastz.danRer1.swap.2004-06-16
    mkdir -p pslChrom
    set tbl = "blastzDanRer1"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end

    # Load database table
    ssh hgwdev
    cd /cluster/data/fr1/bed/blastz.danRer1.swap.2004-06-16/pslChrom
                                                                                
    foreach f (./*.psl)
      /cluster/bin/i386/hgLoadPsl fr1 $f
      echo "$f Done"
    end

# MAKE Human Proteins track
    ssh eieio
    mkdir -p /cluster/data/fr1/blastDb
    cd /cluster/data/fr1/blastDb
    for i in ../superscaffolds/*.fa; do ln -s $i .; done
    for i in *.fa; do formatdb -i $i -p F 2> /dev/null; done
    rm *.fa *.log

    ssh kkr1u00
    mkdir -p /iscratch/i/fr1/blastDb
    cp /cluster/data/fr1/blastDb/* /iscratch/i/fr1/blastDb
    (~kent/bin/iSync) 2>&1 > sync.out
    
    mkdir -p /cluster/data/fr1/bed/tblastn.hg16KG
    cd /cluster/data/fr1/bed/tblastn.hg16KG
    ls -1S /iscratch/i/fr1/blastDb/*.nsq | sed "s/\.nsq//" > fugu.lst
    exit

    # back to kksilo
    cd /cluster/data/fr1/bed/tblastn.hg16KG
    mkdir kgfa
    # calculate a reasonable number of jobs
    calc `wc /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl | awk "{print \\\$1}"`/\(150000/`wc fugu.lst | awk "{print \\\$1}"`\)
    # 41117/(150000/578) = 158.437507
    split -l 158 /cluster/data/hg16/bed/blat.hg16KG/hg16KG.psl kgfa/kg
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    # << happy emacs
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
f=/tmp/`basename $3`
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg16/bed/blat.hg16KG/protein.lft warn $f.2
        mv $3.tmp $3
        rm -f $f.1 $f.2 
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
    # << happy emacs

    chmod +x blastSome
    gensub2 fugu.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/fr1/bed/tblastn.hg16KG
    para create blastSpec
    para shove   # jobs will need to be restarted, but they should all finish
# Completed: 150858 of 150858 jobs
# CPU time in finished jobs:   10404860s  173414.33m  2890.24h  120.43d  0.330 y
# IO & Wait Time:               2468643s   41144.06m   685.73h   28.57d  0.078 y
# Average job time:                  85s       1.42m     0.02h    0.00d
# Longest job:                      411s       6.85m     0.11h    0.00d
# Submission to last job:         26364s     439.40m     7.32h    0.31d

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=10000 stdin ../c.`basename $1`.psl)
'_EOF_'
    # << happy emacs
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/fr1/bed/tblastn.hg16KG
    para create chainSpec
    para push
# Completed: 261 of 261 jobs
# CPU time in finished jobs:        828s      13.81m     0.23h    0.01d  0.000 y
# IO & Wait Time:                 19521s     325.34m     5.42h    0.23d  0.001 y
# Average job time:                  78s       1.30m     0.02h    0.00d
# Longest job:                      270s       4.50m     0.07h    0.00d
# Submission to last job:          3878s      64.63m     1.08h    0.04d

    exit
    # back to kksilo
    cd /cluster/data/fr1/bed/tblastn.hg16KG/blastOut
    for i in kg??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    cat u.*.psl m60* | liftUp -nosort -type=".psl" -nohead stdout /cluster/data/fr1/jkStuff/liftAll.lft warn stdin |  \
    	sort -T /tmp -k 14,14 -k 16,16n -k 17,17n  | uniq  > ../preblastHg16KG.psl
    cd ..
    blatDir=/cluster/data/hg16/bed/blat.hg16KG
    protDat -kg preblastHg16KG.psl $blatDir/hg16KG.psl $blatDir/kg.mapNames blastHg16KG.psl

    ssh hgwdev
    cd /cluster/data/fr1/bed/tblastn.hg16KG
    hgLoadPsl fr1 blastHg16KG.psl
    exit

    # back to kksilo
    rm -rf blastOut

# End tblastn

# BLASTZ SWAP FOR danRer2 vs. fr1 BLASTZ TO CREATE fr1 vs. danRer2
# (DONE, 2004-12-15, hartera)
    ssh kolossus
    mkdir -p /cluster/data/fr1/bed/blastz.danRer2.swap
    cd /cluster/data/fr1/bed/blastz.danRer2.swap
    # use axtChrom from blastzFr1 on danRer2
    set aliDir = /cluster/data/danRer2/bed/blastz.fr1
    cp $aliDir/S1.len S2.len
    cp $aliDir/S2.len S1.len
    mkdir unsorted axtChrom
    cat $aliDir/axtChrom/chr*.axt \
    | axtSwap stdin $aliDir/S1.len $aliDir/S2.len stdout \
    | axtSplitByTarget stdin unsorted
    # Sort the shuffled .axt files.
    foreach f (unsorted/*.axt)
      echo sorting $f:t:r
      axtSort $f axtChrom/$f:t
    end
    du -sh $aliDir/axtChrom unsorted axtChrom
    # 867M    /cluster/data/danRer2/bed/blastz.fr1/axtChrom
    # 868M    unsorted
    # 868M    axtChrom

    rm -r unsorted
    # translate sorted axt files into psl
    cd /cluster/data/fr1/bed/blastz.danRer2.swap
    mkdir -p pslChrom
    set tbl = "blastzDanRer2"
    foreach f (axtChrom/chr*.axt)
      set c=$f:t:r
      echo "Processing chr $c"
      /cluster/bin/i386/axtToPsl $f S1.len S2.len pslChrom/${c}_${tbl}.psl
    end
    # Load database tables
    ssh hgwdev
    cd /cluster/data/fr1/bed/blastz.danRer2.swap/pslChrom
    foreach f (./*.psl)
      /cluster/bin/i386/hgLoadPsl fr1 $f
      echo "$f Done"
    end
# featureBits fr1 all_mrna chrUn_blastzDanRer2 -enrichment
# all_mrna 0.123%, chrUn_blastzDanRer2 17.731%, both 0.092%, cover 74.45%, 
# enrich 4.20x
# featureBits fr1 all_mrna chrUn_blastzDanRer1 -enrichment
# all_mrna 0.123%, chrUn_blastzDanRer1 17.372%, both 0.091%, cover 74.21%, 
# enrich 4.27x

# CHAIN ZEBRAFISH (danRer2) BLASTZ (DONE, 2004-12-15, hartera)
# APPLY chainAntiRepeat TO REMOVE CHAINS THAT ARE THE RESULTS OF REPEATS
# AND DEGENERATE DNA (DONE, 2004-12-23, hartera)
    # Run axtChain on little cluster
    ssh kki
    cd /cluster/data/fr1/bed/blastz.danRer2.swap
    mkdir -p axtChain/run1
    cd axtChain/run1
    mkdir out chain
    ls -1S /cluster/data/fr1/bed/blastz.danRer2.swap/axtChrom/*.axt \
        > input.lst
 cat << '_EOF_' > gsub
#LOOP
doChain {check in exists $(path1)} {check out line+ chain/$(root1).chain} {check out line+ out/$(root1).out}
#ENDLOOP
'_EOF_'
    # << happy emacs
                                                                                
    cat << '_EOF_' > doChain
#!/bin/csh
axtChain $1 /cluster/bluearc/fugu/fr1/chromNib \
           /iscratch/i/danRer2/nib $2 >& $3
'_EOF_'
    # << happy emacs
                                                                                
    chmod a+x doChain
    gensub2 input.lst single gsub jobList
    para create jobList
    para try, check ...
# para time
# CPU time in finished jobs:        158s       2.63m     0.04h    0.00d  0.000 y
# IO & Wait Time:                    11s       0.19m     0.00h    0.00d  0.000 y
# Average job time:                 169s       2.82m     0.05h    0.00d
# Longest job:                      169s       2.82m     0.05h    0.00d
# Submission to last job:           169s       2.82m     0.05h    0.00d
    # now on the cluster server, sort chains
    ssh kksilo
    cd /cluster/data/fr1/bed/blastz.danRer2.swap/axtChain
    chainMergeSort run1/chain/*.chain > all.chain
    chainSplit chain all.chain
    # take a look at score distr's
    foreach f (chain/*.chain)
      grep chain $f | awk '{print $2;}' | sort -nr > /tmp/score.$f:t:r
      echo $f:t:r >> hist5000.out
      textHistogram -binSize=5000 /tmp/score.$f:t:r >> hist5000.out
      echo ""
    end
    # filter with minScore=5000
    mv all.chain all.chain.unfiltered
    chainFilter -minScore=5000 all.chain.unfiltered > all.chainFilt5k
    rm -r chain
    chainSplit chain all.chainFilt5k

    # remove repeats from chains and reload into database
    # (2004-12-23, hartera)
    ssh kksilo
    /cluster/data/fr1/bed/blastz.danRer2.swap/axtChain
    gunzip chain/chr*.gz
    mv chain chainRaw
    mkdir chain
    cd chainRaw
    foreach f (*.chain)
       set c = $f:r
       echo $c
       nice chainAntiRepeat /cluster/bluearc/fugu/fr1/chromNib \
                       /cluster/bluearc/danRer2/nib $f \
                       ../chain/$c.chain
    end
    cd ..
    chainMergeSort ./chain/*.chain > all.chain.antirepeat
    chainSplit chainAR all.chain.antirepeat

    # Load chains into database
    ssh hgwdev
    cd /cluster/data/fr1/bed/blastz.danRer2.swap/axtChain/chainAR
    foreach i (*.chain)
        set c = $i:r
        hgLoadChain fr1 ${c}_chainDanRer2 $i
        echo done $c
    end

# featureBits fr1 all_mrna chrUn_chainDanRer2Link -enrichment
# all_mrna 0.123%, chrUn_chainDanRer2Link 15.517%, both 0.089%, cover 72.00%, 
# enrich 4.64x
# after chainAntiRepeat:
# featureBits fr1 all_mrna chrUn_chainDanRer2Link -enrichment
# all_mrna 0.126%, chrUn_chainDanRer2Link 15.505%, both 0.091%, cover 72.03%, 
# enrich 4.65x

# NET ZEBRAFISH (danRer2) BLASTZ (DONE, 2004-12-15, hartera)
# RE-DO NET WITH CHAINS FILTERED BY chainAntiRepeat (DONE, 2004-12-23,hartera)
    ssh kksilo
    cd /cluster/data/fr1/bed/blastz.danRer2.swap/axtChain
    rm -r preNet
    mkdir preNet
    cd chainAR
    foreach i (*.chain)
       echo preNetting $i
       /cluster/bin/i386/chainPreNet $i ../../S1.len ../../S2.len \
                                     ../preNet/$i
    end
                                                                                
    cd ..
    mkdir n1
    cd preNet
    foreach i (*.chain)
      set n = $i:r.net
      echo primary netting $i
      /cluster/bin/i386/chainNet $i -minSpace=1 ../../S1.len ../../S2.len \
                                 ../n1/$n /dev/null
    end
    cd ..
    cat n1/*.net | /cluster/bin/i386/netSyntenic stdin noClass.net
    # memory usage 77627392, utime 389 s/100, stime 103

    # Add classification info using db tables:
    ssh hgwdev
    cd /cluster/data/fr1/bed/blastz.danRer2.swap/axtChain
    # use -noAr option - don't look for ancient repeats
    netClass -noAr noClass.net fr1 danRer2 zfishdanRer2.net
                                                                                
    # Load the nets into database
    ssh hgwdev
    cd /cluster/data/fr1/bed/blastz.danRer2.swap/axtChain
    netFilter -minGap=10 zfishdanRer2.net | hgLoadNet fr1 netDanRer2 stdin
# featureBits fr1 all_mrna netDanRer2 -enrichment
# all_mrna 0.123%, netDanRer2 31.390%, both 0.096%, cover 77.94%, enrich 2.48x
# featureBits fr1 all_mrna netDanRer2 -enrichment
# all_mrna 0.126%, netDanRer2 31.379%, both 0.098%, cover 77.94%, enrich 2.48x

# BLASTZ FR1 CLEANUP (DONE, 2004-12-15, hartera)
# RE-DONE (DONE, 2004-12-23, hartera)
    ssh kksilo
    cd /cluster/data/fr1/bed/blastz.danRer2.swap
    nice rm axtChain/run1/chain/* &
    nice rm -fr axtChain/n1 axtChain/noClass.net axtChain/preNet &
    nice gzip axtChrom/* pslChrom/* axtChain/all.chainFilt5k axtChain/all.chain.unfiltered axtChain/*.net axtChain/chain/*.chain &
    nice gzip axtChain/all.chain.antirepeat axtChain/chainAR/*.chain &
    nice rm -fr axtChain/chain axtChain/chainRaw &
    
# MAKE VSDANRER2 DOWNLOADABLES (DONE, 2004-12-23, hartera)
    ssh hgwdev
    cd /cluster/data/fr1/bed/blastz.danRer2.swap/axtChrom
    set gp = /usr/local/apache/htdocs/goldenPath/fr1
    mkdir -p $gp/vsDanRer2/axtChrom
    gunzip chrUn.axt.gz
    cp -p *.axt $gp/vsDanRer2/axtChrom
    cd $gp/vsDanRer2/axtChrom
    gzip *.axt
    md5sum *.gz > md5sum.txt

    # copy chains and nets to downloads area
    cd /cluster/data/fr1/bed/blastz.danRer2.swap/axtChain
    gzip -c all.chain.antirepeat > \
            /cluster/data/fr1/zip/zebrafishDanRer2.chain.gz
    gzip -c zfishdanRer2.net > /cluster/data/fr1/zip/zebrafishDanRer2.net.gz
    cd $gp/vsDanRer2
    mv /cluster/data/fr1/zip/zebrafish*.gz .
    md5sum *.gz > md5sum.txt
    # Copy over & edit README.txt w/pointers to chain, net formats.

# MAKE Human Proteins track (hg17 DONE 
    ssh eieio
    mkdir -p /cluster/data/fr1/blastDb
    cd /cluster/data/fr1/blastDb
    ln -s ../superscaffolds/*.fa .
    for i in ss*
    do
	formatdb -i $i -p F
    done
    rm *.log *.fa

    ssh kkr1u00
    mkdir -p /iscratch/i/fr1/blastDb
    cp /cluster/data/fr1/blastDb/* /iscratch/i/fr1/blastDb
    (iSync) > sync.out
    exit # back to eieio

    ssh kk
    mkdir -p /cluster/data/fr1/bed/tblastn.hg17KG
    cd /cluster/data/fr1/bed/tblastn.hg17KG
    ls -1S /iscratch/i/fr1/blastDb/*.nsq | sed "s/\.nsq//" > query.lst

    calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(200000/`wc query.lst | awk "{print \\\$1}"`\)
    # 42156/(200000/578) = 121.830840
    mkdir -p /cluster/bluearc/fr1/bed/tblastn.hg17KG/kgfa
    split -l 122 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl /cluster/bluearc/fr1/bed/tblastn.hg17KG/kgfa/kg
    ln -s /cluster/bluearc/fr1/bed/tblastn.hg17KG/kgfa kgfa
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    mkdir blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    tcsh
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    # << happy emacs
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2
	if pslCheck -prot $3.tmp
	then
	    mv $3.tmp $3
	    rm -f $f.1 $f.2 
	fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
    # << happy emacs

    chmod +x blastSome
    gensub2 query.lst kg.lst blastGsub blastSpec

    cd /cluster/data/fr1/bed/tblastn.hg17KG
    para create blastSpec
    para push
# Completed: 199988 of 199988 jobs
# CPU time in finished jobs:   10762864s  179381.06m  2989.68h  124.57d  0.341 y
# IO & Wait Time:               2421413s   40356.89m   672.61h   28.03d  0.077 y
# Average job time:                  66s       1.10m     0.02h    0.00d
# Longest job:                      819s      13.65m     0.23h    0.01d
# Submission to last job:         36973s     616.22m    10.27h    0.43d

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin ../c.`basename $1`.psl)
'_EOF_'
    # << happy emacs
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/fr1/bed/tblastn.hg17KG
    para create chainSpec
    para push
# Completed: 346 of 346 jobs
# CPU time in finished jobs:       2576s      42.94m     0.72h    0.03d  0.000 y
# IO & Wait Time:                 34240s     570.66m     9.51h    0.40d  0.001 y
# Average job time:                 106s       1.77m     0.03h    0.00d
# Longest job:                      379s       6.32m     0.11h    0.00d
# Submission to last job:          2654s      44.23m     0.74h    0.03d

    exit
    # back to eieio
    cd /cluster/data/fr1/bed/tblastn.hg17KG/blastOut
    for i in kg??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    cat u.*.psl m60* | sort -T /tmp -k 14,14 -k 17,17n -k 17,17n  | uniq | liftUp stdout ../../../Un/chrUn.lft warn stdin  > /cluster/data/fr1/bed/tblastn.hg17KG/blastHg17KG.psl
    cd ..
    liftUp Un/chrUn.fa.out Un/chrUn.lft warn RMRun/Un/*.out

    ssh hgwdev
    cd /cluster/data/fr1/bed/tblastn.hg17KG
    hgLoadPsl fr1 blastHg17KG.psl

    # back to eieio
    rm -rf blastOut
# End tblastn

#########################################################################
# MOUSE NET/CHAINS MM6 - Info contained in makeMm6.doc (200503 Hiram)

# SPLIT MASKED chrUn SEQUENCE INTO SCAFFOLDS AND MAKE SCAFFOLDS 2BIT FILE
# (DONE, 2005-08-13, hartera)
# ADDED SOFT-MASKED SCAFFOLDS 2BIT FILE AND LIFT FILE AND chrUn 2BIT FILE
# TO SAN FOR CLUSTER RUNS AND CREATE FILE OF SCAFFOLD SIZES 
# (DONE, 2005-08-13 and 2006-04-28, hartera)
     ssh eieio
     cd /cluster/data/fr1/Un
     # for chrUn get masked sequence for soft-masked
     mkdir scaffoldsSoftMask
     mkdir -p /panasas/store/fr1/scaffolds
     set outDir=/panasas/store/fr1/scaffolds
     awk 'BEGIN {FS="\t"}{if ($5 != "N") \
 print "/cluster/data/fr1/Un/scaffoldsSoftMask/getMaskedFaFrags.csh /iscratch/i/fr1/chrUn.fa",$2-1, $3, "'$outDir/'"$6".fa", "'$outDir/'"$6".log";}' \
         chrUn.agp >> ./scaffoldsSoftMask/faFragSoftMask 
     # check number of scaffolds
     grep "scaffold" chrUn.agp | wc -l
     # 20379
     wc -l ./scaffoldsSoftMask/faFragSoftMask
     # 20379 ./scaffoldsSoftMask/faFragSoftMask
     # since there are so many, then use kilokluster
     # need to make a wrapper round faFrag as parasol can not
     # deal with redirects.
     cd scaffoldsSoftMask

cat << '_EOF_' > getMaskedFaFrags.csh
#!/bin/csh -fe
set $inFa=$1
set $st = $2
set $end = $3
set $outFa = $4
set $outLog = $5

faFrag -mixed $inFa $st $end $outFa >& $outLog
'_EOF_'
    # << happy emacs
     chmod +x getMaskedFaFrags.csh 
     ssh kkr1u00
     cp /cluster/data/fr1/Un/chrUn.fa /iscratch/i/fr1
     /cluster/bin/iSync 
     ssh kk
     cd /cluster/data/fr1/Un/scaffoldsSoftMask
     mkdir run 
     cd run 
     cp ../faFragSoftMask jobList
     para create jobList
     para try, check, push, check etc.
# para time
# Completed: 20379 of 20379 jobs
# CPU time in finished jobs:     518059s    8634.32m   143.91h    6.00d  0.016 y
# IO & Wait Time:               2831188s   47186.46m   786.44h   32.77d  0.090 y
# Average job time:                 164s       2.74m     0.05h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             469s       7.82m     0.13h    0.01d
# Submission to last job:         10106s     168.43m     2.81h    0.12d
 
     # check a few sequences to see that they are correct
     ssh kolossus
     cd /cluster/data/fr1/Un/scaffoldsSoftMask
     foreach f (/panasas/store/fr1/scaffolds/scaf*.fa)
        set g=$f:t:r  
        echo $g >> names
        perl -pi.bak -e "s/>chr[0-9A-Za-z\-\:]+/>$g/" $f
        cat $f >> /panasas/store/fr1/scaffolds/scaffoldMaskedUnFr1.fa
     end
     grep ">" /panasas/store/fr1/scaffolds/scaffoldMaskedUnFr1.fa | wc -l
     # 20379
     # then make 2bit file for blastz cluster runs
     faToTwoBit /panasas/store/fr1/scaffolds/scaffoldMaskedUnFr1.fa \
               /cluster/data/fr1/fr1UnScaffolds.2bit
     ssh kkr1u00
     mkdir -p /iscratch/i/fr1/UnScaffolds
     cp /cluster/data/fr1/fr1UnScaffolds.2bit /iscratch/i/fr1/UnScaffolds/
     /cluster/bin/iSync 
     # add to the san for cluster runs on pk 
     ssh hgwdev
     mkdir -p /san/sanvol1/scratch/fr1
     # already added to /san/sanvol1/scratch/fr1/UnScaffolds 
     # (2005-08-13, hiram)  - file is fr1UnScaffolds.2bit
     # also put lift file and chrUn 2 bit file on the san (2006-04-28, hartera)
     cp /cluster/data/fr1/Un/lift/ordered.lft \
        /san/sanvol1/scratch/fr1/UnScaffolds
     cp /cluster/data/fr1/fr1.2bit /san/sanvol1/scratch/fr1/
     # create scaffolds sizes file (2006-04-28, hartera)
     cd /san/sanvol1/scratch/fr1/UnScaffolds
     twoBitInfo fr1UnScaffolds.2bit scaffolds.sizes

# BLASTZ SWAP FOR FUGU (fr1) vs ZEBRAFISH (danRer3) (DONE, 2005-08-18, hartera)
# CREATE CHAIN AND NET TRACKS, AXTNET, MAFNET AND ALIGNMENT DOWNLOADS
# RECREATE DOWNLOADS AS THE ZEBRFISH DOWNLOADS DIRECTORY HAS BEEN DELETED
# (DONE, 2005-11-17, hartera)
     ssh hgwdev
     # do swap of danRer3 vs fr1 chain and net alignments to 
     # create fr1 vs danRer3. see makeDanRer3.doc for details.
     # there a no lineage-specific repeats for these species
     cd /cluster/data/danRer3/bed/blastz.fr1
     nohup nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
        -workhorse kkr1u00 -swap -chainMinScore=5000 >& doSwap.log &
     # Start: Aug 18 13:11
     # Finish: Aug 18 13:29
     # Blastz parameters are as for danRer3 vs. fr1 - see makeDanRer3.doc
# BLASTZ_H=2000
# BLASTZ_Q=/cluster/data/blastz/HoxD55.q
# BLASTZ_ABRIDGE_REPEATS=0
     # Run directory files are already on /cluster/data. Remake downloads
     # for zebrafish (danRer3) alignments since these have been removed from
     # the downloads directory. (hartera, 2005-11-17)
     cd /cluster/data/danRer3/bed/blastz.fr1
     nice /cluster/bin/scripts/doBlastzChainNet.pl `pwd`/DEF \
          -swap -continue download -stop download >& doSwapDownload.log &
      
# MAKE Human Proteins track (DONE 2005-06-27 braney)
    mkdir -p /cluster/data/fr1/bed/tblastn.hg17KG
    cd /cluster/data/fr1/bed/tblastn.hg17KG
    ls -1S /iscratch/i/fr1/blastDb/*.nsq | sed "s/\.nsq//" > target.lst

    mkdir kgfa
    # calculate a reasonable number of jobs
    calc `wc /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl | awk "{print \\\$1}"`/\(150000/`wc target.lst | awk "{print \\\$1}"`\)
    # 37365/(150000/578) = 143.979800
    split -l 144 /cluster/data/hg17/bed/blat.hg17KG/hg17KG.psl kgfa/kg
    cd kgfa
    for i in *; do pslxToFa $i $i.fa; rm $i; done
    cd ..
    ls -1S kgfa/*.fa > kg.lst
    rm -rf  /cluster/bluearc/fr1/bed/tblastn.hg17KG/blastOut
    mkdir -p /cluster/bluearc/fr1/bed/tblastn.hg17KG/blastOut
    ln -s  /cluster/bluearc/fr1/bed/tblastn.hg17KG/blastOut
    for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
    cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl } 
#ENDLOOP
'_EOF_'
    # << happy emacs
    cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/iscratch/i/blast/data
export BLASTMAT
f=/tmp/`basename $3`
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /scratch/blast/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
if /cluster/bin/i386/blastToPsl $f.1 $f.2
then
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg17/bed/blat.hg17KG/protein.lft warn $f.2
        mv $3.tmp $3
        rm -f $f.1 $f.2 
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8
exit 1
'_EOF_'
    # << happy emacs

    chmod +x blastSome
    gensub2 target.lst kg.lst blastGsub blastSpec

    ssh kk
    cd /cluster/data/fr1/bed/tblastn.hg17KG
    para create blastSpec
    para push
# Completed: 150280 of 150280 jobs
# CPU time in finished jobs:    4621712s   77028.53m  1283.81h   53.49d  0.147 y
# IO & Wait Time:               1350989s   22516.48m   375.27h   15.64d  0.043 y
# Average job time:                  40s       0.66m     0.01h    0.00d
# Longest finished job:             204s       3.40m     0.06h    0.00d
# Submission to last job:         72366s    1206.10m    20.10h    0.84d

    cat << '_EOF_' > chainGsub
#LOOP
chainSome $(path1)
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > chainSome
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=25000 stdin ../c.`basename $1`.psl)
'_EOF_'
    # << happy emacs
    chmod +x chainSome

    ls -1dS `pwd`/blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec

    ssh kki
    cd /cluster/data/fr1/bed/tblastn.hg17KG
    para create chainSpec
    para push
# Completed: 260 of 260 jobs
# CPU time in finished jobs:        614s      10.24m     0.17h    0.01d  0.000 y
# IO & Wait Time:                 12566s     209.43m     3.49h    0.15d  0.000 y
# Average job time:                  51s       0.84m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:             156s       2.60m     0.04h    0.00d
# Submission to last job:           969s      16.15m     0.27h    0.01d

    ssh eieio
    cd /cluster/data/fr1/bed/tblastn.hg17KG/blastOut
    for i in kg??
    do 
	awk "(\$13 - \$12)/\$11 > 0.6 {print}" c.$i.psl > c60.$i.psl
	sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
	awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
	echo $i
    done

    cat u.*.psl m60* | liftUp -nosort -type=".psl" -nohead stdout /cluster/data/fr1/jkStuff/liftAll.lft warn stdin |  \
    	sort -u -T /tmp -k 14,14 -k 16,16n -k 17,17n  > /cluster/data/fr1/bed/tblastn.hg17KG/blastHg17KG.psl
    cd ..

    ssh hgwdev
    cd /cluster/data/fr1/bed/tblastn.hg17KG
    hgLoadPsl fr1 blastHg17KG.psl
    exit

    rm -rf blastOut

# End tblastn

# BLASTZ DANRER1 CLEANUP (DONE, 2006-02-07, hartera)
    ssh kkstore02
    cd /cluster/data/fr1/bed/blastz.danRer1.swap.2004-06-16
    rm pslChrom/psl.tab
    nice gzip axtChrom/* pslChrom/* &


############################################################################
##  BLASTZ swap from mm8 alignments (DONE - 2006-02-19 - Hiram)
    ssh pk
    cd /cluster/data/mm8/bed/blastzFr1.2006-02-19
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
        -swap `pwd`/DEF > swap.out 2>&1 &
    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
        -swap -continue=net `pwd`/DEF > swap.net.out 2>&1 &

     time nice -n +19 featureBits fr1 chainMm8Link
    #   42671288 bases of 315518167 (13.524%) in intersection

############################################################################
##  BLASTZ swap from hg17 alignments (DONE 2006-04-09 markd)
    mkdir /cluster/data/fr1/bed/blastz.hg17.swap
    ln -s blastz.hg17.swap /cluster/data/fr1/bed/blastz.hg17
    cd /cluster/data/fr1/bed/blastz.hg17.swap
    
    #
    cp /cluster/data/hg17/bed/blastz.fr1/DEF .
    # edit to change /iscratch/i/hg17/bothMaskedNibs to /scratch/hg/hg17/bothMaskedNibs

    time /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 -stop=net \
	-swap -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	./DEF >& swap.out&

   # create the net filee (DONE 2006-04-10 markd)
    ssh hgwdev
    cd /cluster/data/fr1/bed/blastz.hg17.swap/axtChain
    nice netClass -verbose=0 -noAr noClass.net fr1 hg17 fr1.hg17.net
    nice gzip fr1.hg17.net
###########################################################################
# BLASTZ SWAP TO CREATE CHAIN AND NET ALIGNMENTS FOR ZEBRAFISH (danRer4) 
# AND CREATE AXNET, MAFNET, LIFTOVER AND ALIGNMENT DOWNLOADS 
# (DONE, 2006-04-29, hartera)
    ssh pk
    # Used these BLASTZ parameters - see also makeDanRer4.doc
    # BLASTZ_H=2000
    # BLASTZ_M=50
    # BLASTZ_Q=/cluster/data/blastz/HoxD55.q 
    # and no lineage-specific repeats.
    # Alignments are in: /cluster/data/fr1/bed/blastz.danRer4.swap

    cd /cluster/data/danRer4/bed/blastz.fr1.2006-04-28
    nice /cluster/bin/scripts/doBlastzChainNet.pl -verbose=2 \
        -bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
        -swap `pwd`/DEF >& doSwap.log &
    # Took about 30 minutes.

   # check coverage compared to danRer3:
   featureBits fr1 chainDanRer4Link
   # 86236583 bases of 315518167 (27.332%) in intersection  
   # There is no refGene table for fr1 so use mrna:
   featureBits fr1 mrna chainDanRer4Link -enrichment
   # mrna 0.187%, chainDanRer4Link 27.332%, both 0.153%, cover 82.08%, 
   # enrich 3.00x
   featureBits fr1 mrna netDanRer4 -enrichment
   # mrna 0.187%, netDanRer4 69.291%, both 0.172%, cover 92.16%, enrich 1.33x
   featureBits fr1 mrna netDanRer3 -enrichment
   # mrna 0.187%, netDanRer3 68.354%, both 0.171%, cover 91.52%, enrich 1.34x
   # Runs out of memory for danRer3 chains. For nets, similar coverage for
   # danRer4 and danRer3 nets on fr1.


#####################################################################
# SEGMENTAL DUPLICATIONS (DONE 6/30/06 angie)
    # File emailed from Xinwei She <xws@u.washington.edu>
    mkdir /cluster/data/fr1/bed/genomicSuperDups
    cd /cluster/data/fr1/bed/genomicSuperDups
    liftUp -type=.bed stdout /cluster/data/fr1/jkStuff/liftAll.lft warn \
      fr1_genomicSuperDup.tab \
    | sed -e 's/\t_\t/\t-\t/' \
    | awk '($3 - $2) >= 500 && ($9 - $8) >= 500 {print;}' \
    | hgLoadBed fr1 genomicSuperDups stdin \
      -sqlTable=$HOME/kent/src/hg/lib/genomicSuperDups.sql

#########################################################################
##  Reorder Fish organisms (DONE - 2006-12-22 - Hiram)
    hgsql -h genome-testdb hgcentraltest \
	-e "update dbDb set orderKey = 465 where name = 'fr1';"


#########################################################################
##  BLASTZ SWAP MEDAKA oryLat1 to Fr1 (DONE - 2007-01-11 - Hiram)
    ssh kkstore02
    mkdir /cluster/data/fr1/bed/blastz.oryLat1.swap
    cd /cluster/data/fr1/bed/blastz.oryLat1.swap
    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/oryLat1/bed/blastz.fr1.2006-12-22/DEF \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -bigClusterHub=pk > swap.log 2>&1 &
    #	real    22m4.224s

#########################################################################
## MEDAKA ORYLAT1 BLASTZ (WORKING - 2007-01-11 - Hiram)
##	The above swap doesn't contain the the Medaka chrUn scaffolds
#	and it would be good to have the Fr1 alignments back on Medaka
##	So, to get them, run this blastz of the Medaka chroms and chrUn
##	Scaffolds, then swap it back to Medaka, no Db loads as we are
##	going to extract the chrUn chains and nets out of this business

    cat << '_EOF_' > DEF
# Fugu vs. Medaka chroms and chrUn in scaffolds

# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Fugu fr1
#       Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates
#       20,397 scaffolds, largest 1.1M bases, smallest 107 bases
SEQ1_DIR=/san/sanvol1/scratch/fr1/chrUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
SEQ1_CTGDIR=/san/sanvol1/scratch/fr1/fr1.UnScaffolds.sdTrf.2bit
SEQ1_CTGLEN=/san/sanvol1/scratch/fr1/scaffold.sizes
SEQ1_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
SEQ1_LIMIT=10

# QUERY: Medaka oryLat1, chrUn in scaffolds
#       36,494 scaffolds, 25 chroms
SEQ2_DIR=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/oryLat1/oryLat1.sdTrf.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/oryLat1/oryLat1UnScaffolds.sizes
SEQ2_LIFT=/san/sanvol1/scratch/oryLat1/chrUn.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=0
SEQ2_LIMIT=10

BASE=/cluster/data/fr1/bed/blastz.oryLat1.2007-01-11
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl -verbose=2 DEF \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-stop=net -bigClusterHub=pk > do.log 2>&1 &

##########################################################################
## Swap Stickleback gasAcu1 alignments back to fr1
##	(DONE - 2006-12-11 - Hiram)
    mkdir /cluster/data/fr1/bed/blastz.gasAcu1.swap
    cd /cluster/data/fr1/bed/blastz.gasAcu1.swap
    time doBlastzChainNet.pl \
	/cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08/DEF \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -bigClusterHub=pk > do.log 2>&1 &

    time doBlastzChainNet.pl \
	/cluster/data/gasAcu1/bed/blastz.fr1.2006-12-08/DEF \
	-continue=net -chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -bigClusterHub=pk > net.log 2>&1 &
    #	real    24m33.761s
    #	user    0m0.125s
    #	sys     0m0.080s

    ssh hgwdev
    cd /cluster/data/fr1/bed/blastz.gasAcu1.swap
    time nice -n 19 featureBits fr1 chainGasAcu1Link \
	> fb.fr1.chainGasAcu1Link.txt 2>&1 &
    #	148005715 bases of 315518167 (46.909%) in intersection

#######################################################################
## Stickleback gasAcu1  BLASTZ (WORKING - 2007-01-11 - Hiram)
##	The above swap doesn't contain the the Stickleback random contigs
##	and it would be good to have the Fr1 alignments back on Stickleback
##	So, to get them, run this blastz of the Stickleback chroms and random
##	contigs, then swap it back to Stickleback, no Db loads as we are
##	going to extract the random chains and nets out of this business
##	(WORKIN - 2007-01-11 - Hiram)
    ssh kkstore02
    mkdir /cluster/data/fr1/bed/blastz.gasAcu1.2007-01-11
    cd /cluster/data/fr1/bed/blastz.gasAcu1.2007-01-11
    cat << '_EOF_' > DEF
# Fugu vs. Stickleback chroms and randoms in scaffolds

# Try "human-fugu" (more distant, less repeat-killed than mammal) params
# +M=50:
BLASTZ_H=2000
BLASTZ_Y=3400
BLASTZ_L=6000
BLASTZ_K=2200
BLASTZ_M=50
BLASTZ_Q=/cluster/data/blastz/HoxD55.q

# TARGET: Fugu fr1
#       Align to the scaffolds, results lifed up to chrUn.sdTrf coordinates
#       20,397 scaffolds, largest 1.1M bases, smallest 107 bases
SEQ1_DIR=/san/sanvol1/scratch/fr1/chrUn.sdTrf.2bit
SEQ1_LEN=/san/sanvol1/scratch/fr1/chrom.sizes
SEQ1_CTGDIR=/san/sanvol1/scratch/fr1/fr1.UnScaffolds.sdTrf.2bit
SEQ1_CTGLEN=/san/sanvol1/scratch/fr1/scaffold.sizes
SEQ1_LIFT=/san/sanvol1/scratch/fr1/UnScaffolds/ordered.lft
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
SEQ1_LIMIT=10

# QUERY: Stickleback gasAcu1 chroms and chrUn in contigs
#       The largest chrom is 32M bases, the largest contig 418,670 bases
#       The smallest chrom chrM is 15,742 bases, smallest contig 60 bases
#       there are 5,115 chroms and contig pieces
#       total size 1,089,690,673 bases
#       47107538 N's 1042583135 real 834274605 upper 208308530 lower
SEQ2_DIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.2bit
SEQ2_LEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.sdTrf.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/gasAcu1/gasAcu1.randomContigs.sdTrf.sizes
SEQ2_LIFT=/san/sanvol1/scratch/gasAcu1/chrUn.extraCloneGap.lift
SEQ2_CHUNK=1000000
SEQ2_LIMIT=20
SEQ2_LAP=0

BASE=/cluster/data/fr1/bed/blastz.gasAcu1.2007-01-11
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time doBlastzChainNet.pl -verbose=2 DEF -smallClusterHub=pk \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-blastzOutRoot /cluster/bluearc/fr1GasAcu1Un \
	-stop=net -bigClusterHub=pk > do.log 2>&1 &
XXXX - running 2007-01-11 20:55
    time doBlastzChainNet.pl -verbose=2 DEF -smallClusterHub=pk \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-blastzOutRoot /cluster/bluearc/fr1GasAcu1Un \
	-continue=cat -stop=net -bigClusterHub=pk > cat-net.log 2>&1 &
    #	real    170m43.012s
    ## An interesting failure in this one that did not stop it:
    #	HgStepManager: executing step 'chainMerge'.
    # ssh -x kolossus nice 'chainMergeSort
    # /cluster/data/fr1/bed/blastz.gasAcu1.2007-01-11/axtChain/run/chain/*.chain |
    # nice gzip -c >
    # /cluster/data/fr1/bed/blastz.gasAcu1.2007-01-11/axtChain/fr1.gasAcu1.all.chain.gz'
    #	bash: /bin/nice: Argument list too long

    ## To work-around this:
    ssh kolossus
    cd /cluster/data/fr1/bed/blastz.gasAcu1.2007-01-11/axtChain/run/chain
    chainMergeSort *.chain | gzip -c > ../../fr1.gasAcu1.all.chain.gz
    cd ../..
    chainSplit chain fr1.gasAcu1.all.chain.gz
    time ./netChains.csh > netChains.out 2>&1 &
    #	real    12m14.200s
    ## OK, that is sufficient to get going with the swap
    mkdir /cluster/data/gasAcu1/bed/blastz.fr1.swap
    cd /cluster/data/gasAcu1/bed/blastz.fr1.swap
    time doBlastzChainNet.pl -verbose=2 \
	/cluster/data/fr1/bed/blastz.gasAcu1.2007-01-11/DEF \
	-chainMinScore=2000 -chainLinearGap=loose \
	-tRepeats=windowmaskerSdust -qRepeats=windowmaskerSdust \
	-swap -stop=net -bigClusterHub=pk > swap.log 2>&1 &

###########################################################################
## Chicken/Fugu chain/net swap - (DONE - 2007-03-12 - Hiram)
    mkdir /cluster/data/fr1/bed/blastz.galGal3.swap
    cd /cluster/data/fr1/bed/blastz.galGal3.swap
    time doBlastzChainNet.pl -verbose=2 -qRepeats=windowmaskerSdust \
	/cluster/data/galGal3/bed/blastz.fr1.2007-03-09/DEF \
	-bigClusterHub=pk -chainMinScore=5000 -chainLinearGap=loose \
	-swap > swap.log  2>&1 &
    #	real    3m14.494s
    cat fb.fr1.chainGalGal3Link.txt
    #	33536400 bases of 315518167 (10.629%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT (DONE - 2007-04-09 - Hiram)
    # This will find repeats within the genome that should not be matched
    # against. Uses 11-mers.
    # Use -repMatch=128 (based on size -- for human we use 1024, and
    # fugu size is ~12% of human judging by gapless fr2 vs. hg18
    # genome sizes from featureBits.

    ssh kolossus
    blat /cluster/data/fr1/fr1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=/cluster/data/fr1/11.ooc -repMatch=128
    #	Wrote 5850 overused 11-mers to /cluster/data/fr1/11.ooc

    cp -p /cluster/data/fr1/11.ooc /cluster/bluearc/fr1

#########################################################################
# CLEANUP OF DANRER2 BLASTZ (DONE, 2007-06-25, hartera)
    ssh kkstore02
    cd /cluster/store5/Fugu_Rubripes_V3/bed/blastz.danRer2.swap/
    # split chain files can be recreated from all.chain files so remove chains
    rm -r chainAR
    cd ..
    gzip axtChrom/*.axt 
    # removed about 0.9 G of data

