#!/bin/csh -f # set emacs mode
exit; # don't actually run this like a script :)

# Cat Mar 2006 Broad Assembly

# DOWNLOAD SEQUENCE 
    ssh kkstore01
    mkdir /cluster/store2/felCat1
    cd /cluster/data
    ln -s /cluster/store2/felCat1 felCat1
    cd /cluster/data/felCat1
    mkdir jkStuff bed
    mkdir broad
    cd broad
    wget "http://www.broad.mit.edu/ftp/pub/assemblies/mammals/cat/felCat1/assembly.agp"
    wget "http://www.broad.mit.edu/ftp/pub/assemblies/mammals/cat/felCat1/assembly.bases.gz"
#    wget "http://www.broad.mit.edu/ftp/pub/assemblies/mammals/cat/felCat1/assembly.quals.gz"
#    wget "http://www.broad.mit.edu/ftp/pub/assemblies/mammals/cat/felCat1/unplaced.fasta.gz"
#    wget "http://www.broad.mit.edu/ftp/pub/assemblies/mammals/cat/felCat1/unplaced.qual.gz"

    gunzip assembly.bases.gz 
    faSize assembly.bases
# 1,642,698,377 bases (0 N's 1642698377 real 1642698377 upper 0 lower) in 817956 sequences in 1 files
# Total size: mean 2008.3 sd 1931.8 min 50 (contig_713825) max 146926 (contig_534097) median 1394
# N count: mean 0.0 sd 0.0
# U count: mean 2008.3 sd 1931.8
# L count: mean 0.0 sd 0.0

    /cluster/bin/x86_64/agpAllToFaFile assembly.agp assembly.bases ../felCat1.fa
    faSize ../felCat1.fa

# 4045535322 bases (2402836945 N's 1642698377 real 1642698377 upper 0 lower) in 217790 sequences in 1 files
# Total size: mean 18575.4 sd 53162.6 min 56 (scaffold_206924) max 1539409 (scaffold_217569) median 1960
# N count: mean 11032.8 sd 31089.2
# U count: mean 7542.6 sd 24907.3
# L count: mean 0.0 sd 0.0

    /cluster/bin/scripts/agpToLift < assembly.agp > ../jkStuff/assembly.lft

# PARTITION SCAFFOLDS FOR REPEATMASKER RUN 
    # glom the tiny scaffolds up into ~50k collections 
    ssh kkstore01
    cd /cluster/data/felCat1
    mkdir chunks50k
    faSplit about broad/assembly.bases 50000 chunks50k/chunk_
    cd chunks50k
    for i in 0 1 2 3 4 5 6 7 8 9; do mkdir $i; mv *$i.fa $i; done

# RUN REPEAT MASKER 
    # make the run directory, output directory, and job list
    ssh kkstore01
    cd /cluster/data/felCat1
    tcsh
    cat << '_EOF_' > jkStuff/RMCat
#!/bin/csh -fe

cd $1
/bin/mkdir -p /tmp/felCat1/$2
/bin/cp $3 /tmp/felCat1/$2/
pushd /tmp/felCat1/$2
/cluster/bluearc/RepeatMasker/RepeatMasker -s -species cat $2
popd
/bin/cp /tmp/felCat1/$2/$2.out ./
/bin/rm -fr /tmp/felCat1/$2/*
/bin/rmdir --ignore-fail-on-non-empty /tmp/felCat1/$2
/bin/rmdir --ignore-fail-on-non-empty /tmp/felCat1
'_EOF_'
    # << this line makes emacs coloring happy
    chmod +x jkStuff/RMCat

    mkdir RMRun RMOut
    cd RMOut
    mkdir 0 1 2 3 4 5 6 7 8 9
    cd ../chunks50k
    for i in */*.fa
    do
	e=`dirname $i`
	d="/cluster/data/felCat1"
	c=`basename $i`
    echo "../jkStuff/RMCat $d/RMOut/$e $c  {check in line+ $d/chunks50k/$i} {check out line+ $d/RMOut/$e/$c.out}"
    done > ../RMRun/RMJobs

    # do the run
    ssh kk
    cd /cluster/data/felCat1/RMRun
    para create RMJobs
    para try, check, push, check,...

# Completed: 31654 of 31654 jobs
# CPU time in finished jobs:   16603766s  276729.43m  4612.16h  192.17d  0.527 y
# IO & Wait Time:                362052s    6034.20m   100.57h    4.19d  0.011 y
# Average job time:                 536s       8.93m     0.15h    0.01d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            6222s     103.70m     1.73h    0.07d
# Submission to last job:        215588s    3593.13m    59.89h    2.50d

    # Lift up the split-scaffold .out's to scaffold .out's
    ssh kkstore01
    cd /cluster/data/felCat1/RMOut
    for i in 0 1 2 3 4 5 6 7 8 9; do echo $i; liftUp -nohead $i.out ../jkStuff/assembly.lft warn $i/*.fa.out>/dev/null; done
    head -3 0.out > felCat1.out
    for i in 0 1 2 3 4 5 6 7 8 9; do tail +4 $i.out; done >> felCat1.out
    # Load the .out files into the database with:
    ssh hgwdev
    hgLoadOut felCat1 /cluster/data/felCat1/RMOut/felCat1.out
    # Lots of strange perc messages
    # Loading up table felCat1_rmsk
    # note: 6 records dropped due to repStart > repEnd

    hgsql felCat1 -e 'rename table felCat1_rmsk to rmsk'
    # Fix up the indices too:
    hgsql felCat1 -e 'drop index bin       on rmsk;  drop index genoStart on rmsk;  drop index genoEnd   on rmsk; \
                  create index bin       on rmsk (genoName(16), bin); \
                  create index genoStart on rmsk (genoName(16), genoStart); \
                  create index genoEnd   on rmsk (genoName(16), genoEnd);'


# EXTRACTING GAP INFO FROM BLOCKS OF NS (DONE 11/5/04 angie)
    ssh kkstore01
    mkdir /cluster/data/felCat1/bed/fakeAgp
    cd /cluster/data/felCat1/bed/fakeAgp
    faGapSizes ../../downloads/scaffolds.fasta  -niceSizes=5,10,20,25,30,40,50,100,250,500,1000,10000,100000
    # Wow, all block of N's seem to be exactly 100bp long.  
    # hgFakeAgp's default -minContigGap of 25 will be fine.  
    hgFakeAgp ../../downloads/scaffolds.fasta fake.agp
    ssh hgwdev
    hgLoadGap -unsplit felCat1 /cluster/data/felCat1/bed/fakeAgp/fake.agp


# SIMPLE REPEATS (TRF) 
    ssh kkstore01
    mkdir /cluster/data/felCat1/bed/simpleRepeat
    cd /cluster/data/felCat1/bed/simpleRepeat
    nice trfBig -trf=/cluster/bin/i386/trf ../../felCat1.fa  /dev/null -bedAt=simpleRepeat.bed -tempDir=/tmp
    # check on this with
    tail -f trf.log

    # Load this into the database as so
    ssh hgwdev
    hgLoadBed felCat1 simpleRepeat /cluster/data/felCat1/bed/simpleRepeat/simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql


# FILTER SIMPLE REPEATS (TRF) INTO MASK 
    # make a filtered version of the trf output: 
    # keep trf's with period <= 12:
    ssh kkstore01
    cd /cluster/data/felCat1/bed/simpleRepeat
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed


# MASK FA USING REPEATMASKER AND FILTERED TRF FILES 
    ssh kolossus
    cd /cluster/data/felCat1
    /cluster/bin/x86_64/maskOutFa -soft felCat1.fa bed/simpleRepeat/trfMask.bed felCat1.simple.fa
    /cluster/bin/x86_64/maskOutFa -softAdd felCat1.simple.fa RMOut/felCat1.out felCat1.masked.fa
    mv felCat1.fa felCat1.unmasked.fa
    # Now clean up the unmasked split scaffolds to avoid confusion later.
    rm -r chunks500k scaffoldsSplit.fa jkStuff/scaffoldsSplit.lft


# CREATING DATABASE
    # Create the database.
    ssh hgwdev
    # Make sure there is at least 5 gig free for the database
    df -h /var/lib/mysql
Filesystem            Size  Used Avail Use% Mounted on
# /dev/sdc1             1.8T  915G  746G  56% /var/lib/mysql
    hgsql '' -e 'create database felCat1'

# STORE SEQUENCE AND ASSEMBLY INFORMATION 
    # Translate to 2bit
    ssh kkstore01
    cd /cluster/data/felCat1
    /cluster/bin/x86_64/faToTwoBit felCat1.masked.fa felCat1.2bit
    # Make chromInfo.tab.
    mkdir bed/chromInfo
    twoBitInfo felCat1.2bit stdout | awk '{printf "%s\t%s\t/gbdb/felCat1/felCat1.2bit\n",$1,$2;}' \
	| sort > bed/chromInfo/chromInfo.tab

    # Make symbolic a link from /gbdb/felCat1 to the 2bit.
    ssh hgwdev
    mkdir -p /gbdb/felCat1
    ln -s /cluster/data/felCat1/felCat1.2bit /gbdb/felCat1/
