# for emacs: -*- mode: sh; -*-

# Drosophila willistoni -- TIGR "CAF1" via Eisen's 12-fly site

# THIS IS ONLY TO GET MASKED SEQUENCE -- NOT A BROWSER AT THIS POINT


#########################################################################
# DOWNLOAD SEQUENCE (DONE 9/26/06 angie)
    # Create the assembly's storage directory on store12 and expose it as
    # /cluster/data/droWil1, the path that later steps in this file refer to.
    ssh kkstore05
    mkdir /cluster/store12/droWil1
    ln -s /cluster/store12/droWil1 /cluster/data/droWil1
    mkdir /cluster/data/droWil1/downloads
    cd /cluster/data/droWil1/downloads
    # Fetch and unpack the CAF1 tarball, then work inside the dwil/
    # directory, which holds the scaffolds.bases file sized below.
    wget http://rana.lbl.gov/drosophila/caf1/dwil_caf1.tar.gz
    tar xvzf dwil_caf1.tar.gz
    cd dwil
    # Record sequence statistics; the faSize output is pasted below.
    # It reports 14927 sequences and 0 lower-case bases.
    faSize scaffolds.bases
#236693122 bases (12173174 N's 224519948 real 224519948 upper 0 lower) in 14927 sequences in 1 files
#Total size: mean 15856.7 sd 302548.7 min 868 (scaffold_189452) max 16660200 (scaffold_181130) median 1508
#N count: mean 815.5 sd 8057.0
#U count: mean 15041.2 sd 299560.9
#L count: mean 0.0 sd 0.0


#########################################################################
# MAKE GENOME DB *UP TO DB STEP ONLY* (DONE 9/26/06 angie)
    ssh kkstore05
    cd /cluster/data/droWil1
    # Write the makeGenomeDb.pl config file.  Every line between the cat
    # command and the EOF terminator -- including the leading
    # "# Config parameters" line -- is content of droWil1.config.ra, so
    # notes cannot be placed inside it.  fastaFiles and agpFiles point at
    # the files unpacked into downloads/dwil/.
    cat > droWil1.config.ra <<EOF
# Config parameters for makeGenomeDb.pl:
db droWil1
clade insect
scientificName Drosophila willistoni
assemblyDate Feb. 2006
assemblyLabel TIGR CAF1
orderKey 57
mitoAcc none
fastaFiles /cluster/data/droWil1/downloads/dwil/scaffolds.bases
agpFiles /cluster/data/droWil1/downloads/dwil/assembly.agp
dbDbSpeciesDir drosophila
EOF

    # Stop at db step so we can use featureBits, but don't do dbDb and trackDb
    # because we're not building an actual browser for now.
    # The run is backgrounded with stdout and stderr captured in
    # makeGenomeDb.log; tail -f follows that log in the foreground.
    makeGenomeDb.pl droWil1.config.ra -stop=db \
      >& makeGenomeDb.log & tail -f makeGenomeDb.log


#########################################################################
# REPEATMASKER (DONE 9/26/06 angie)
    ssh kkstore05
    # Run -debug to create the dir structure and preview the scripts:
    # NOTE(review): this preview run does not pass -species drosophila,
    # while the real run below does -- confirm the previewed scripts were
    # not the ones actually used.
    doRepeatMasker.pl droWil1 -verbose 3 -debug
    # Run it for real and tail the log:
    # The log goes into the dated bed/RepeatMasker.2006-09-26 directory,
    # presumably the one created by the -debug run above.
    doRepeatMasker.pl droWil1 -species drosophila -verbose 3 \
      >& /cluster/data/droWil1/bed/RepeatMasker.2006-09-26/do.log &
    tail -f /cluster/data/droWil1/bed/RepeatMasker.2006-09-26/do.log
    # RepeatMasker and lib version from do.log:
#    March 20 2006 (open-3-1-5) version of RepeatMasker
#CC   RELEASE 20060315;                                            *
    # No previous assembly to compare to.
    # Instead, spot-check rmsk coverage on scaffold_181130 only (the
    # largest scaffold according to the faSize output recorded in the
    # download section).
    featureBits -chrom=scaffold_181130 droWil1 rmsk
#1276946 bases of 16594458 (7.695%) in intersection


#########################################################################
# SIMPLE REPEATS (TRF) (DONE 9/26/06 angie)
    ssh kolossus
    # Work in a niced tcsh so the long TRF run has lowered priority.
    nice tcsh
    mkdir /cluster/data/droWil1/bed/simpleRepeat
    cd /cluster/data/droWil1/bed/simpleRepeat
    # Stream the unmasked assembly as FASTA into trfBig.  The primary
    # output is discarded (/dev/null); the BED written via -bedAt is what
    # the rest of this section uses.  Messages are captured in trf.log.
    twoBitToFa ../../droWil1.unmasked.2bit stdout \
    | trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
      -bedAt=simpleRepeat.bed -tempDir=/tmp \
    >& trf.log & tail -f trf.log
    # ~60 minutes (longer than D. mel, must be because of the scaffolds)

    # Make a filtered version for sequence masking:
    # Keeps only rows whose 5th column is <= 12 (presumably the repeat
    # period in trfBig's BED output -- TODO confirm).
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

    # Load unfiltered repeats into the database:
    # The table schema comes from the kent source checkout under $HOME.
    ssh hgwdev
    hgLoadBed droWil1 simpleRepeat \
      /cluster/data/droWil1/bed/simpleRepeat/simpleRepeat.bed \
      -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql
    # No previous assembly to compare to.
    # Spot-check simpleRepeat coverage on the same single scaffold that
    # was used for the rmsk check.
    featureBits -chrom=scaffold_181130 droWil1 simpleRepeat
#227314 bases of 16594458 (1.370%) in intersection


#########################################################################
# MASK SEQUENCE WITH FILTERED TRF IN ADDITION TO RM (DONE 9/26/06 angie)
    ssh kolossus
    cd /cluster/data/droWil1
    # Start from the RepeatMasker-masked droWil1.rmsk.2bit and add the
    # filtered TRF intervals (trfMask.bed) on top via -add; the result,
    # droWil1.2bit, is the masked sequence this build exists to produce.
    time twoBitMask droWil1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed droWil1.2bit
    # This warning is OK -- the extra fields are not block coordinates.
#Warning: BED file bed/simpleRepeat/trfMask.bed has >=13 fields which means it might contain block coordinates, but this program uses only the first three fields (the entire span -- no support for blocks).
#0.305u 0.421s 0:07.30 9.8%      0+0k 0+0io 0pf+0w

    # Because this is a no-browser build (just masking for alignment)
    # I did not make the usual /gbdb/$db/$db.2bit link.


###########################################################################
# WINDOWMASKER EXPERIMENT (DONE 10/17/06 angie)
    # The droAna3-droWil1 blastz run was just destroyed by mega-output,
    # even with -chainFilterMinScore=10000 and M=50 (trying M=20 but not
    # too hopeful)... so let's try a de-novo masker before alignment:
    ssh kolossus
    mkdir /cluster/data/droWil1/bed/windowmasker.2006-10-17
    cd /cluster/data/droWil1/bed/windowmasker.2006-10-17
    # windowmasker reads FASTA, so dump the RM+TRF-masked 2bit to a
    # temporary file (removed at the end of this section).
    twoBitToFa /cluster/data/droWil1/droWil1.2bit tmp.fa
    # First, collect counts:
    /cluster/bin/x86_64/windowmasker -mk_counts true -input tmp.fa \
      -output wm.counts
    # Then use those counts to mask sequence:
    time /cluster/bin/x86_64/windowmasker -ustat wm.counts -input tmp.fa \
      -output wm.intervals 
#236.727u 1.626s 4:00.07 99.2%   0+0k 0+0io 0pf+0w
    # Convert the interval listing to BED: a ">lcl|NAME ..." header line
    # sets the current sequence name and is dropped; each "S - E" line
    # becomes "NAME<tab>S<tab>E+1".  The +1 turns E into a BED-style
    # exclusive end, assuming windowmasker reports inclusive ends -- TODO
    # confirm.
    # NOTE(review): the backslash-newlines inside the single-quoted perl
    # program look like tcsh continuation syntax; bash would hand the
    # backslashes to perl literally -- check the shell before re-running.
    perl -wpe 'if (s/^>lcl\|(\w+).*\n$//) { $chr = $1; } \
               if (/^(\d+) - (\d+)/) { \
                 $s=$1; $e=$2+1; s/(\d+) - (\d+)/$chr\t$s\t$e/; \
               }' wm.intervals > windowmasker.bed
    # Quick coverage:
    # Sum of interval lengths, then sum of all sequence sizes (which
    # matches the faSize base count from the download section), then the
    # masked fraction.
    awk '{print $3 - $2;}' windowmasker.bed | total
#78439535
    awk '{print $2;}' ../../chrom.sizes  | total
#236693122
    calc 78439535 / 236693122
#78439535 / 236693122 = 0.331398
    # Make a masked .2bit:
    # NOTE(review): unlike the TRF masking step, no -add is given here --
    # verify whether the RM+TRF masking already in droWil1.2bit is meant
    # to be replaced by the windowmasker intervals rather than extended.
    twoBitMask ../../droWil1.2bit windowmasker.bed ../../droWil1.WM.2bit

    # Now try with -sdust to additionally mask low-complexity sequence:
    # Reuses the wm.counts file from the first pass; only the masking
    # stage is repeated.
    time /cluster/bin/x86_64/windowmasker -ustat wm.counts -sdust true \
       -input tmp.fa -output wm.sdust.intervals
#510.488u 2.410s 8:35.66 99.4%   0+0k 0+0io 0pf+0w
    # Same interval-to-BED conversion as for wm.intervals above.
    perl -wpe 'if (s/^>lcl\|(\w+).*\n$//) { $chr = $1; } \
               if (/^(\d+) - (\d+)/) { \
                 $s=$1; $e=$2+1; s/(\d+) - (\d+)/$chr\t$s\t$e/; \
               }' wm.sdust.intervals > windowmasker.sdust.bed
    # Coverage with sdust, against the same 236693122-base total.
    awk '{print $3 - $2;}' windowmasker.sdust.bed | total
#93767264
    calc 93767264 / 236693122
#93767264 / 236693122 = 0.396155
    # Make a masked .2bit (even if we don't end up needing it):
    twoBitMask ../../droWil1.2bit windowmasker.sdust.bed \
      ../../droWil1.WMSDust.2bit
    # Remove the temporary FASTA dump.
    rm tmp.fa


#########################################################################
# SWAP DM3 CHAIN/NET (DONE 4/6/09 angie)
    # This step works under /hive/data/genomes rather than the
    # /cluster/data paths used by the earlier sections.
    mkdir /hive/data/genomes/droWil1/bed/blastz.dm3.swap
    cd /hive/data/genomes/droWil1/bed/blastz.dm3.swap
    # Run doBlastzChainNet.pl in -swap mode against the DEF file of the
    # existing dm3 blastz.droWil1 run; the job is backgrounded with stdout
    # and stderr in do.log, which tail -f then follows.
    doBlastzChainNet.pl -swap -bigClusterHub swarm -smallClusterHub memk \
      -workhorse kolossus \
      /hive/data/genomes/dm3/bed/blastz.droWil1/DEF >& do.log &
    tail -f do.log
    # Give the swap directory the plain blastz.dm3 name as well, via a
    # relative symlink inside droWil1/bed.
    ln -s blastz.dm3.swap /hive/data/genomes/droWil1/bed/blastz.dm3


#########################################################################
