#########################################################################
# Sorex araneus -- Common Shrew
# Broad sorAra1 2X release

#########################################################################
# Download sequence (2007-03-05 kate)

    ssh kkstore05
    mkdir -p /cluster/store12/sorAra1
    ln -s /cluster/store12/sorAra1 /cluster/data/sorAra1 
    cd /cluster/data/sorAra1
    mkdir downloads
    cd downloads
    wget -r -nd ftp://ftp.broad.mit.edu/pub/assemblies/mammals/commonShrew/sorAra1

#########################################################################
# Create .ra file and run makeGenomeDb.pl
    ssh hgwdev
    cd /cluster/data/sorAra1
    cat << '_EOF_' >sorAra1.config.ra
# Config parameters for makeGenomeDb.pl:
db sorAra1
clade mammal
genomeCladePriority 36
scientificName Sorex araneus 
commonName Shrew
assemblyDate June 2006
assemblyLabel Broad Institute sorAra1 (Draft_v2)
orderKey 100
# search Entrez nucleotide for 'sorex araneus mitochondrion complete genome, use GI #'
# not found (only sorex unguiculatus)
mitoAcc none
fastaFiles /cluster/data/sorAra1/downloads/assembly.bases.gz
agpFiles /cluster/data/sorAra1/downloads/assembly.agp
qualFiles /cluster/data/sorAra1/downloads/scaffold.lifted.qac
dbDbSpeciesDir shrew
'_EOF_'

    makeGenomeDb.pl -verbose=2 -stop=seq sorAra1.config.ra >& makeGenomeDb.out &

    # Need dbDb entry for name lookup
    hgsql -e 'INSERT INTO dbDb (name, description, nibPath, organism, \
        defaultPos, active, orderKey, genome, scientificName, \
        htmlPath, hgNearOk, hgPbOk, sourceName) \
        VALUES("sorAra1", "June 2006", "/gbdb/sorAra1", "Shrew", \
        "", 0, 100, "Shrew", \
        "Sorex araneus", "", 0, 0, \
        "Broad Institute sorAra (Draft_v2)")' -h localhost hgcentraltest

################################################
## WINDOWMASKER (2007-03-05 kate)
    ssh kkstore05
    cd /cluster/data/sorAra1/bed/
    # note: only runs on kolossus, due to network connection to NCBI
    # requested Andy hardcode this.
    nice /cluster/bin/scripts/doWindowMasker.pl sorAra1 \
        -workhorse=kolossus >& wmRun.log &

    ln -s WindowMasker.2007-03-05 WMRun
    mv wmRun.log WMRun
    cd WMRun

    # convert the lower-case n's left by WM to upper case (request to Andy to fix this)
    twoBitToFa sorAra1.wmsk.sdust.2bit stdout | tr n N | \
            faToTwoBit stdin /cluster/data/sorAra1/sorAra1.2bit

    # stats on masking
    cd /cluster/data/sorAra1
    twoBitToFa sorAra1.2bit stdout | nice faSize stdin >& faSize.log &
# 2936119008 bases (1103254311 N's 1832864697 real 1116732269 upper 716132428 lower) in 262057 sequences in 1 files
# Total size: mean 11204.1 sd 25927.2 min 203 (scaffold_107701) max 791662 (scaffold_254556) median 4789

    # 39.1% masked
    # Mouse and rat are ~43%, from RM.  TRF gives them an extra 2%, skip here

    mkdir -p /san/sanvol1/scratch/sorAra1
    cp -p sorAra1.2bit chrom.sizes /san/sanvol1/scratch/sorAra1

    #	load this table after creating a db (DONE - 2008-10-21 - Hiram)
    cd /hive/data/genomes/sorAra1/bed/WindowMasker.2007-03-05
    time hgLoadBed sorAra1 windowmaskerSdust windowmasker.sdust.bed.gz
    #	Loaded 11038904 elements of size 3
    #	real    3m20.424s

################################################
# DOWNLOADS (2007-06-05 kate)

    ssh kkstore05
    cd /cluster/data/sorAra1
    mkdir bigZips
    cd bigZips
    nice twoBitToFa ../sorAra1.2bit sorAra1.fa
    cp ../downloads/assembly.agp sorAra1.agp
    nice gzip sorAra1.fa sorAra1.agp
    md5sum *.gz > md5sum.txt

    ssh hgwdev
    set d = /usr/local/apache/htdocs/goldenPath
    set bd = /cluster/data/sorAra1
    cp $d/felCat3/bigZips/README.txt $bd/bigZips
    # EDIT
    mkdir -p $d/sorAra1/bigZips
    ln -s $bd/bigZips/{*.gz,md5sum.txt,README.txt} $d/sorAra1/bigZips
   
##############################################################################
# creating DB to make it easier to work with this (DONE - 2008-10-20 - Hiram)
    cd /hive/data/genomes/sorAra1/downloads
    qaToQac assembly.quals.gz stdout \
    | qacAgpLift assembly.agp stdin sorAra1.quals.qac
    cd /hive/data/genomes/sorAra1
    # edit sorAra1.config.ra to set:
    #	qualFiles /cluster/data/sorAra1/downloads/sorAra1.quals.qac
    makeGenomeDb.pl -workhorse=hgwdev -continue=agp -stop=agp \
	sorAra1.config.ra > makeAgp.log 2>&1
    time makeGenomeDb.pl -workhorse=hgwdev -continue=db -stop=db \
	sorAra1.config.ra > makeDb.log 2>&1
    #	real    27m48.147s
    time makeGenomeDb.pl -workhorse=hgwdev -continue=dbDb -stop=dbDb \
	sorAra1.config.ra > makeDbDb.log 2>&1
    #	real    0m1.294s

##############################################################################
## Repeat Masker (DONE - 2008-10-20 - Hiram)
    screen	# to manage this several day job
    mkdir /hive/data/genomes/sorAra1/bed/repeatMasker
    cd /hive/data/genomes/sorAra1/bed/repeatMasker
    time $HOME/kent/src/hg/utils/automation/doRepeatMasker.pl \
	-workhorse=hgwdev -bigClusterHub=swarm \
	-buildDir=`pwd` sorAra1 > do.log 2>&1 &
    #	real    258m6.902s

    twoBitToFa sorAra1.rmsk.2bit stdout | faSize stdin > faSize.rmsk.txt
# 2936119008 bases (1103254311 N's 1832864697 real 1411267433 upper 421597264
# lower) in 262057 sequences in 1 files
# %14.36 masked total, %23.00 masked real

#########################################################################
# SIMPLE REPEATS TRF (DONE - 2008-10-21 - Hiram)
    screen # use a screen to manage this job
    mkdir /hive/data/genomes/sorAra1/bed/simpleRepeat
    cd /hive/data/genomes/sorAra1/bed/simpleRepeat
    # 
    time $HOME/kent/src/hg/utils/automation/doSimpleRepeat.pl \
	-buildDir=/cluster/data/sorAra1/bed/simpleRepeat sorAra1 > do.log 2>&1 &
    #	real    140m0.001s
    cat fb.simpleRepeat
    #	32481599 bases of 1832864697 (1.772%) in intersection

    #	after RM run is done, add this mask:
    cd /hive/data/genomes/sorAra1
    rm sorAra1.2bit
    twoBitMask sorAra1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed \
	sorAra1.rmTrf.2bit
    #	can safely ignore warning about >=13 fields in bed file

    twoBitToFa sorAra1.rmTrf.2bit stdout \
	| faSize stdin > sorAra1.rmTrf.faSize.txt
# 2936119008 bases (1103254311 N's 1832864697 real 1410119486 upper 422745211
# lower) in 262057 sequences in 1 files
# %14.40 masked total, %23.06 masked real
    #	leaving the windowmasker masked sequence in place, do not
    #	overwrite with this one
    #	link to gbdb
    #	ln -s `pwd`/sorAra1.2bit /gbdb/sorAra1

#########################################################################
# prepare for kluster runs (DONE - 2008-10-16 - Hiram)
    # compare to size of real bases to adjust the repMatch
    #	hg18: 2881421696
    #	sorAra1: 1832864697
    # thus: 1024 * 1832864697/2881421696 = 651
    #	rounding up to 700 for a bit more conservative masking
    cd /hive/data/genomes/sorAra1
    time blat sorAra1.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=sorAra1.11.ooc -repMatch=700
    #	Wrote 26119 overused 11-mers to sorAra1.11.ooc
    #	real    1m57.876s

    #	and staging data for push to kluster nodes
    mkdir /hive/data/staging/data/sorAra1
    cp -p sorAra1.2bit chrom.sizes sorAra1.11.ooc \
	/hive/data/staging/data/sorAra1
    #	request to cluster admin to push this to the kluster nodes
    #	/scratch/data/

###########################################################################
# add NCBI identifiers to the dbDb (DONE - 2008-10-21 - Hiram)
    hgsql -e 'update dbDb set
sourceName="Broad Institute sorAra1 (NCBI project 20325, ABRP01000000)" where name="sorAra1";' hgcentraltest

############################################################################
#  sorAra1 - Shrew - Ensembl Genes version 51  (DONE - 2008-12-02 - hiram)
    ssh kkr14u06
    cd /hive/data/genomes/sorAra1
    cat << '_EOF_' > sorAra1.ensGene.ra
# required db variable
db sorAra1
# do we need to translate geneScaffold coordinates
geneScaffolds yes
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl -ensVersion=51 sorAra1.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/sorAra1/bed/ensGene.51
    featureBits sorAra1 ensGene
    # 19400431 bases of 1832864697 (1.058%) in intersection

    # *** All done!  (through the 'makeDoc' step)
    # *** Steps were performed in /hive/data/genomes/sorAra1/bed/ensGene.51

############################################################################
