# for emacs: -*- mode: sh; -*-
# Lottia gigantea (lotGig1) -- genome browser build notes

#########################################################################
# DOWNLOAD SEQUENCE (todo)
#	Create the lotGig1 build directory, fetch the JGI v1.0 scaffold FASTA
#	into its jgi/ subdirectory, and record a checksum and faSize summary.
    ssh kolossus
    mkdir /hive/data/genomes/lotGig1
    # expose the hive build directory under /cluster/data; every later step
    # in these notes refers to it as /cluster/data/lotGig1
    ln -s /hive/data/genomes/lotGig1 /cluster/data
    mkdir /cluster/data/lotGig1/jgi
    cd /cluster/data/lotGig1/jgi

    wget --timestamping \
ftp://ftp.jgi-psf.org/pub/JGI_data/Lottia_gigantea/v1.0/Lotgi1_assembly_scaffolds.fasta.gz
    # checksum of everything downloaded so far (all files matching Lotgi1*)
    md5sum Lotgi1* > assembly.md5sum

# no quals yet; command kept commented out for when quality files appear
#    qaToQac assembly.quals.gz stdout | qacAgpLift assembly.agp stdin lotGig1.qual.qac

   # summary of the downloaded assembly; output recorded below
   faSize Lotgi1_assembly_scaffolds.fasta.gz
# 359512207 bases (60610637 N's 298901570 real 298901570 upper 0 lower) in 4475 sequences in 1 files
# Total size: mean 80337.9 sd 438106.3 min 1000 (sca_18165) max 9386848 (sca_1) median 3622
# N count: mean 13544.3 sd 53307.5
# U count: mean 66793.6 sd 391964.5
# L count: mean 0.0 sd 0.0
# %0.00 masked total, %0.00 masked real


#########################################################################
# Create .ra file and run makeGenomeDb.pl  (todo)
#	Writes lotGig1.config.ra into the build directory and starts
#	makeGenomeDb.pl on it in the background inside a screen session.
    ssh hgwdev
    screen
    cd /cluster/data/lotGig1
# The here-document below is the literal content of lotGig1.config.ra; the
# '#' lines inside it are written to the file as-is (they are not shell
# comments).
cat << _EOF_ >lotGig1.config.ra
# Config parameters for makeGenomeDb.pl:
db lotGig1
clade deuterostome
genomeCladePriority 35
scientificName  Lottia gigantea
commonName Gastropod snail
assemblyDate Jul. 2007
assemblyLabel JGI
orderKey 236.5
#mitoAcc AJ222767
mitoAcc none
fastaFiles /cluster/data/lotGig1/jgi/Lotgi1_assembly_scaffolds.fasta.gz
fakeAgpMinContigGap 25
fakeAgpMinScaffoldGap 25 
# agpFiles /cluster/data/lotGig1/broad/assembly.agp
# qualFiles /cluster/data/lotGig1/broad/lotGig1.qual.qac
dbDbSpeciesDir mollusc
_EOF_

    # runs in the background; stdout and stderr both go to makeGenomeDb.out
    makeGenomeDb.pl -workhorse kolossus -verbose=2 lotGig1.config.ra > makeGenomeDb.out 2>&1 &

# 'ctl-a ctl-d' detaches from the screen session and returns to the previous shell
# Distribution of sequence sizes (column 2 of chrom.sizes):
cut -f 2 chrom.sizes | ave stdin
# NOTE(review): the figures recorded below (count 3144, total 2723219641) do
# not agree with the faSize summary of the downloaded assembly in the
# DOWNLOAD SEQUENCE section (4475 sequences, 359512207 bases). They appear to
# be carried over from another assembly's notes -- re-run and replace.
# Q1 7169.500000
# median 13298.000000
# Q3 55788.500000
# average 866164.007952
# min 3002.000000
# max 88675666.000000
# count 3144
# total 2723219641.000000
# standard deviation 5316243.688364


#########################################################################
# REPEATMASKER (not done)
#	Run the doRepeatMasker.pl pipeline for lotGig1 in the background inside
#	a screen session, then measure rmsk coverage with featureBits.
    screen # use a screen to manage this job
    mkdir /cluster/data/lotGig1/bed/repeatMasker
    cd /cluster/data/lotGig1/bed/repeatMasker
    # -species selects the repeat library and must name the organism being
    # masked.  These notes previously said "tarsier" (a primate), left over
    # from the notes this file was templated from.
    # NOTE(review): confirm that the installed RepeatMasker library resolves
    # "Lottia gigantea"; if it does not, fall back to a broader clade name.
    /cluster/bin/scripts/doRepeatMasker.pl -species "Lottia gigantea" \
	-buildDir=/cluster/data/lotGig1/bed/repeatMasker    \
	lotGig1 > do.log 2>&1 &

# new parasol, lots of crashes, no times

    # If the cluster run has to be recovered by hand, resume the pipeline at
    # its "cat" step; this run logs to do2.log so do.log is preserved.
    /cluster/bin/scripts/doRepeatMasker.pl -species "Lottia gigantea" \
	-continue cat -buildDir=/cluster/data/lotGig1/bed/repeatMasker    \
	lotGig1 > do2.log 2>&1 &


    # coverage of the rmsk track, at lowest CPU priority, in the background
    time nice -n +19 featureBits lotGig1 rmsk > fb.lotGig1.rmsk.txt 2>&1 &
    # NOTE(review): the result recorded below is not from this assembly (its
    # denominator, 2771976320, is far larger than the 298901570 real bases
    # faSize reports for the lotGig1 scaffolds); replace it after the run.
    # 1154651023 bases of 2771976320 (41.654%) in intersection


#########################################################################
# SIMPLE REPEATS TRF (not done)
#	Run the doSimpleRepeat.pl pipeline, add its trfMask.bed to the
#	RepeatMasker-masked 2bit to produce the final lotGig1.2bit, then link
#	the result into /gbdb and copy it to san scratch space.
    screen # use a screen to manage this job
    mkdir /cluster/data/lotGig1/bed/simpleRepeat
    cd /cluster/data/lotGig1/bed/simpleRepeat
    # runs in the background; stdout and stderr both go to do.log
    doSimpleRepeat.pl -buildDir=/cluster/data/lotGig1/bed/simpleRepeat \
	lotGig1 > do.log 2>&1 &

    #### When done
    ssh pk
    para time
    # NOTE(review): every figure recorded in this section (51 jobs, the
    # 2771976320-base denominators, the 659020 "scaffold_N" sequences) is
    # inconsistent with this assembly, whose faSize summary lists 4475
    # "sca_N" sequences and 359512207 bases.  The numbers look carried over
    # from another assembly's notes -- replace them after the real run.
    # Completed: 51 of 51 jobs
    # CPU time in finished jobs:      24985s     416.41m     6.94h    0.29d
    # 0.001 y
    # IO & Wait Time:                   101s       1.69m     0.03h    0.00d
    # 0.000 y
    # Average job time:                 492s       8.20m     0.14h    0.01d
    # Longest finished job:            3887s      64.78m     1.08h    0.04d
    # Submission to last job:          3911s      65.18m     1.09h    0.05d

    featureBits lotGig1 simpleRepeat
    # 50851363 bases of 2771976320 (1.834%) in intersection

    #	after RM run is done, add this mask:
    cd /cluster/data/lotGig1
    twoBitMask lotGig1.rmsk.2bit -add bed/simpleRepeat/trfMask.bed lotGig1.2bit

    # masking summary after adding the TRF mask
    twoBitToFa lotGig1.2bit stdout | faSize stdin
#    3183347966 bases (411371646 N's 2771976320 real 1619032005 upper
#    1152944315 lower) in 659020 sequences in 1 files
#    Total size: mean 4830.4 sd 8873.5 min 600 (scaffold_659019) max 329427
#    (scaffold_0) median 1645
#    N count: mean 624.2 sd 1398.1
#    U count: mean 2456.7 sd 5229.3
#    L count: mean 1749.5 sd 2909.8
#    %36.22 masked total, %41.59 masked real

    # same summary for the RepeatMasker-only 2bit, for comparison
    twoBitToFa lotGig1.rmsk.2bit stdout | faSize stdin
# 3183347966 bases (411371646 N's 2771976320 real 1619660149 upper 1152316171
# lower) in 659020 sequences in 1 files
# Total size: mean 4830.4 sd 8873.5 min 600 (scaffold_659019) max 329427
# (scaffold_0) median 1645
# N count: mean 624.2 sd 1398.1
# U count: mean 2457.7 sd 5230.7
# L count: mean 1748.5 sd 2908.4
# %36.20 masked total, %41.57 masked real


    # Link to it from /gbdb
    ln -s /cluster/data/lotGig1/lotGig1.2bit /gbdb/lotGig1/lotGig1.2bit

    # The scratch directory must exist before the copies: without it the
    # first cp creates a plain file named lotGig1 and the second cp
    # overwrites that file.  -p makes this a no-op when it already exists.
    mkdir -p /san/sanvol1/scratch/lotGig1
    cp /cluster/data/lotGig1/lotGig1.2bit /san/sanvol1/scratch/lotGig1
    cp /cluster/data/lotGig1/chrom.sizes /san/sanvol1/scratch/lotGig1


###########################################################################
# prepare for kluster runs (not done)
#	Build the overused-11-mer (.ooc) file for blat and stage the 2bit,
#	chrom.sizes and .ooc files for distribution to the cluster nodes.
    # compare to size of real bases to adjust the repMatch
    #	hg18: 2881421696
    #	lotGig1: 298901570  ("real" count from faSize on the JGI scaffolds)
    # thus: 1024 * 298901570/2881421696 = 106
    #	rounding up to 110 for a bit more conservative masking
    # (These notes previously used 2768536343 real bases and -repMatch=1000;
    # that base count is not this assembly's.)
    cd /hive/data/genomes/lotGig1
    time blat lotGig1.2bit \
	/dev/null /dev/null -tileSize=11 -makeOoc=lotGig1.11.ooc -repMatch=110
    # NOTE(review): the output recorded below predates the repMatch
    # correction above; re-record both lines after the run.
    #	Wrote 31947 overused 11-mers to lotGig1.11.ooc
    #	real    2m30.155s
    #	and staging data for push to kluster nodes
    mkdir /hive/data/staging/data/lotGig1
    # -p preserves mode, ownership and timestamps on the staged copies
    cp -p lotGig1.2bit chrom.sizes lotGig1.11.ooc \
	/hive/data/staging/data/lotGig1
    #	request to cluster admin to push this to the kluster nodes
    #	/scratch/data/

###########################################################################
