# for emacs: -*- mode: sh; -*-

 
# This file describes building the browser database for the archaeal
# species Ferroplasma acidarmanus.
#
# If this is the first time you are making your own hgwdev browser, first
# cd to ~/kent/src/ and run make.

# DOWNLOAD SEQUENCE FROM GENBANK (DONE 10/2/05)
#
# Creates the ferrAcid1 data directory (real location on store5, symlinked
# under /cluster/data), copies the Ferr_acid* files from the lab's local
# copy, renames the FASTA, builds a .2bit from it, and links the .2bit
# into /gbdb/ferrAcid1.

    mkdir /cluster/store5/archae/ferrAcid1
    ln -s /cluster/store5/archae/ferrAcid1 /cluster/data/ferrAcid1
    cd /cluster/data/ferrAcid1
    cp /projects/lowelab/db/Bacteria/Ferroplasma_acidarmanus/Ferr_acid* .
    mv Ferr_acid.fa ferrAcid1.fa
    # list the current sequence header lines
    grep ">" ferrAcid1.fa
    # Manual step: edit the header lines of ferrAcid1.fa by hand.
    # This doc lists the target names as
    # '>chr >plasmid_pNRC100 >plasmid_pNRC200'.
    # NOTE(review): those plasmid names look carried over from another
    # genome's build doc, and the conservation section of this file works
    # with sequence names beginning with "Contig" -- confirm the intended
    # header names.
   
    faToTwoBit ferrAcid1.fa ferrAcid1.2bit 

    mkdir /gbdb/ferrAcid1
    ln -s /cluster/data/ferrAcid1/ferrAcid1.2bit /gbdb/ferrAcid1/ferrAcid1.2bit

# CREATE DATABASES AND A BUNCH OF INITIAL STUFF (DONE 10/2/05)
#
# Creates the ferrAcid1 database, writes per-sequence sizes to chrom.sizes,
# copies the grp table from hg16, and registers the assembly in the
# dbDb, defaultDb and genomeClade tables of hgcentraltest.

    ssh hgwdev
    echo 'create database ferrAcid1' | hgsql ''
    cd /cluster/data/ferrAcid1

    # chrom.sizes is loaded into the chromInfo table in the next section
    faSize -detailed ferrAcid1.fa > chrom.sizes
    # clone hg16's grp table, adding a primary key on NAME
    echo "create table grp (PRIMARY KEY(NAME)) select * from hg16.grp" | hgsql ferrAcid1
    # assembly entry: default position chr:500000-550000, html description
    # page expected at /gbdb/ferrAcid1/html/description.html
    echo 'INSERT INTO dbDb (name, description, nibPath, organism, defaultPos, active, orderKey, genome, scientificName, htmlPath, hgNearOk) values ("ferrAcid1", "Jun 2005 Draft", "/gbdb/ferrAcid1", "Ferroplasma acidarmanus", "chr:500000-550000", 1, 280, "Ferroplasma acidarmanus","Ferroplasma acidarmanus Fer1", "/gbdb/ferrAcid1/html/description.html", 0);' | hgsql hgcentraltest
    # make ferrAcid1 the default assembly for this genome
    echo 'INSERT INTO defaultDb (genome, name) values ("Ferroplasma acidarmanus", "ferrAcid1");' | hgsql hgcentraltest
    # place the genome in the "archaea" clade with priority 85
    echo 'INSERT INTO genomeClade (genome, clade, priority) values ("Ferroplasma acidarmanus", "archaea",85);'  | hgsql hgcentraltest

# CREATE CHROMINFO TABLE 
#
# Creates chromInfo from the stock schema, fills it from chrom.sizes, points
# every row at the .2bit file, then sets up the trackDb directory for this
# assembly and builds trackDb.
  ssh hgwdev
  cd /cluster/data/ferrAcid1

   # table definition is taken from another user's kent checkout
   cp ~baertsch/kent/src/hg/lib/chromInfo.sql .
   hgsql ferrAcid1 < chromInfo.sql
   echo "load data local infile 'chrom.sizes' into table chromInfo" | hgsql ferrAcid1
   # no WHERE clause: every sequence gets the same fileName
   echo "update chromInfo set fileName = '/gbdb/ferrAcid1/ferrAcid1.2bit'" | hgsql ferrAcid1

    cd ~/kent/src/hg/makeDb/trackDb

    # add the trackDb directories
    mkdir -p archae/ferrAcid1
    cvs add archae/ferrAcid1
    cvs commit archae/ferrAcid1

    # build trackDb for this database only
    make DBS=ferrAcid1


# GC20BASE (DONE 10/2/05)
#
# Computes GC percent with hgGcPercent (-win=20, -overlap=10), encodes it as
# wiggle (.wig/.wib) files, links the .wib into /gbdb, and loads the
# gc20Base table.
    mkdir -p /cluster/data/ferrAcid1/bed/gc20Base
    cd /cluster/data/ferrAcid1/bed/gc20Base
    # hgGcPercent writes to stdout and wigEncode reads it from stdin
    hgGcPercent -wigOut -doGaps -file=stdout -overlap=10 -win=20 ferrAcid1 /cluster/data/ferrAcid1/ | wigEncode stdin gc20Base.wig gc20Base.wib

    cd /cluster/data/ferrAcid1/bed/gc20Base
    mkdir /gbdb/ferrAcid1/wib
    # link the binary .wib where -pathPrefix below tells the table to find it
    ln -s `pwd`/gc20Base.wib /gbdb/ferrAcid1/wib
    hgLoadWiggle -pathPrefix=/gbdb/ferrAcid1/wib ferrAcid1 gc20Base gc20Base.wig
    # verify index is correct:
    hgsql ferrAcid1 -e "show index from gc20Base;"
    # should see good numbers in Cardinality column


# TANDEM REPEAT MASKER (DONE 10/2/05)
#
# Runs trfBig on the genome FASTA, keeping only the BED output, and loads
# that BED into the simpleRepeat table.

    ssh hgwdev
    mkdir -p /cluster/data/ferrAcid1/bed/simpleRepeat
    cd /cluster/data/ferrAcid1
    # the sequence output goes to /dev/null; repeats are written via -bedAt
    trfBig ferrAcid1.fa /dev/null -bedAt=/cluster/data/ferrAcid1/bed/simpleRepeat/chr.bed
    cd /cluster/data/ferrAcid1/bed/simpleRepeat
    # table schema comes from simpleRepeat.sql in lowe's kent checkout
    hgLoadBed ferrAcid1 simpleRepeat *.bed -sqlTable=/cluster/home/lowe/kent/src/hg/lib/simpleRepeat.sql

# NO TIGR GENES
# genome not available at http://www.tigr.org/tigr-scripts/CMR2/gene_attribute_form.dbi

# DESCRIPTION PAGE (DONE 10/4/05)
#
# Publishes the hand-written description.html: copies it from the trackDb
# source tree into the data directory and links it into /gbdb/ferrAcid1/html,
# which is the htmlPath registered in dbDb.

    # Write ~/kent/src/hg/makeDb/trackDb/archae/ferrAcid1/description.html
    # make the page world-readable
    chmod a+r ~/kent/src/hg/makeDb/trackDb/archae/ferrAcid1/description.html
    mkdir -p /cluster/data/ferrAcid1/html/
    cp ~/kent/src/hg/makeDb/trackDb/archae/ferrAcid1/description.html /cluster/data/ferrAcid1/html/description.html
    mkdir -p /gbdb/ferrAcid1/html
    ln -s /cluster/data/ferrAcid1/html/description.html /gbdb/ferrAcid1/html/

# MULTIZ with therAcid, therVolc, picrTorr
# DONE (10/13/05), kpollard
#
# Setup for the conservation track: gathers the reference and comparison
# sequences in bed/conservation, builds nib/2bit files and chrom.sizes, and
# splits the reference into one upper-cased FASTA plus nib per contig.
# The foreach/set/end loops in this section are csh syntax.

    cd /cluster/data/ferrAcid1/bed/
    mkdir conservation
    cd conservation
    # scoring matrix handed to blastz (Q=HoxD55.q) in the alignment step
    cp /cluster/data/metAce1/bed/conservation/HoxD55.q .
    # NOTE(review): no step shown in this doc creates ../../chr.fa (the
    # download step produces ferrAcid1.fa) -- confirm where it comes from.
    cp ../../chr.fa ferrAcid1.chr
    # rewrite the first "Contig" on each line to "ferrAcid1.Contig"
    sed s/Contig/ferrAcid1.Contig/ ferrAcid1.chr > temp
    mv temp ferrAcid1.chr
    # comparison genomes, taken from their own conservation directories
    cp /cluster/data/therAcid1/bed/conservation/therAcid1.chr .
    cp /cluster/data/picrTorr1/bed/conservation/picrTorr1.chr .
    cp /cluster/data/therVolc1/bed/conservation/therVolc1.chr .
    faToNib therVolc1.chr therVolc1.chr.nib
    faToNib therAcid1.chr therAcid1.chr.nib
    faToNib picrTorr1.chr picrTorr1.chr.nib
    faToTwoBit ferrAcid1.chr ferrAcid1.2bit
    #chrom sizes
    # covers all four *.chr files; used for both sides by axtToMaf later
    faSize -detailed *.chr > chrom.sizes
    #make fa for each contig
    faSplit byname ferrAcid1.chr ./
    # contignames.txt: file names of the form ferrAcid1.Contig*.fa
    ls ferrAcid1.Contig*.fa > contignames.txt
    # contigs.txt: the same names with the first 10 characters
    # ("ferrAcid1.") removed
    cat contignames.txt | gawk '{print substr($1,11,length($1))}' > contigs.txt
    # per contig: keep the header line as is, upper-case every other line,
    # then build <name>.nib ($f:r is $f without its last extension)
    foreach f (`cat contignames.txt`)
	set b=$f:r
	cat $f | gawk '{if(/>ferrAcid1.Contig/){print $0;}else{print toupper($0);}}' > temp
	mv temp $f
	faToNib $f $b.nib
    end

    #blastz 
    # Pairwise alignment pipeline, per contig and comparison genome:
    # blastz (.lav) -> lavToAxt (.axt) -> axtBest (-best.axt) -> axtToMaf (.maf)
    # $f:t:r is the file name without directory and without last extension.
    foreach f (`cat contignames.txt`)
	set b=$f:t:r
	echo $b
	blastz $f therAcid1.chr Q=HoxD55.q > ${b}-therAcid1.lav 
	blastz $f picrTorr1.chr Q=HoxD55.q > ${b}-picrTorr1.lav
	blastz $f therVolc1.chr Q=HoxD55.q > ${b}-therVolc1.lav
    end

    # convert every .lav to .axt; the two "." arguments presumably point
    # lavToAxt at the nib files in the current directory -- confirm
    foreach f(*.lav)
        set b=$f:r
	echo $b
	lavToAxt $f . . $b.axt
    end

    # filter each pairwise .axt with axtBest; $b (ferrAcid1.ContigN) is
    # passed as the sequence name
    foreach f(`cat contignames.txt`)
	set b=$f:r
	echo $b
	axtBest ${b}-therAcid1.axt $b -winSize=500 -minScore=5000 ${b}-therAcid1-best.axt
	axtBest ${b}-picrTorr1.axt $b -winSize=500 -minScore=5000 ${b}-picrTorr1-best.axt
	axtBest ${b}-therVolc1.axt $b -winSize=500 -minScore=5000 ${b}-therVolc1-best.axt
    end

    # <name>-best.axt -> <name>.maf; the same chrom.sizes serves both sides
    foreach f(*-best.axt)
	set b=`basename $f -best.axt`
	echo $b
	axtToMaf $f chrom.sizes chrom.sizes $b.maf
    end
     
    #multiz
    #remove extra header lines
    # the gawk BEGIN block prints the first line of each .maf and reads past
    # the next four; the main rule prints every remaining line unchanged
    foreach f(*.maf)
	cat $f | gawk 'BEGIN{getline; print $0; getline; getline; getline; getline;}{print $0;}' > temp
	mv temp $f
    end

    # build the 4-way alignment progressively per contig: first merge the
    # therAcid1 and picrTorr1 pairwise mafs, then merge therVolc1 into that
    foreach f(`cat contignames.txt`)
	set b=$f:r
	echo $b
	multiz ${b}-therAcid1.maf ${b}-picrTorr1.maf - > ${b}-therAcid1-picrTorr1.maf
	multiz ${b}-therVolc1.maf ${b}-therAcid1-picrTorr1.maf - > ${b}-therAcid1-picrTorr1-therVolc1.maf
    end

    #phyloHMM
    # convert each contig's 4-way maf to SS format, using the contig FASTA
    # as the -M reference; output is ferrAcid1.ContigN.ss
    foreach f(`cat contignames.txt`)
	set b=$f:r
	echo $b
	msa_view -i MAF -M $f -o SS ${b}-therAcid1-picrTorr1-therVolc1.maf > $b.ss
    end
    
    # Per contig (names from contigs.txt, so $b is ContigN): write a copy of
    # the SS file with its NAMES line replaced, then fit a tree model with
    # phyloFit, output prefix ContigN_FaTaPtTv.
    # NOTE(review): the copy with the rewritten NAMES line is saved as
    # $b.ss, while phyloFit (and the phastCons call below) read the
    # unmodified ferrAcid1.$b.ss -- confirm which file was intended.
    foreach f(`cat contigs.txt`)
	set b=$f:r
	cat ferrAcid1.$b.ss | gawk '{if(/^NAMES/){print "NAMES = ferrAcid1,therVolc1,therAcid1,picrTorr1";} else{print $0;}}' > temp
	mv temp $b.ss
	phyloFit -i SS ferrAcid1.$b.ss -t "(ferrAcid1,(picrTorr1,(therAcid1,therVolc1)))" -o ${b}_FaTaPtTv
    end

    #Contig 169 has the largest NTUPLES (1051) so use it for starting mod
    # it shows GC=0.205868+0.188380=0.394248
    #add GC content to next call
    # Estimate trees per contig with phastCons, starting from the Contig169
    # model; --estimate-trees ${b} sets the output prefix, which is where
    # the *.cons.mod / *.noncons.mod files used below come from.
    foreach f(`cat contigs.txt`)
	set b=$f:r
	echo $b
	phastCons ferrAcid1.$b.ss Contig169_FaTaPtTv.mod \
	--gc 0.3942 --target-coverage 0.7 --estimate-trees ${b} \
	--expected-lengths 25 --no-post-probs --ignore-missing \
	--nrates 1,1
    end

    # Attempted but not used (left commented out):
    #average with phyloBoot to get cons.mod and noncons.mod
    #ls *.cons.mod > cons.txt
    #phyloBoot --read-mods '*cons.txt' --output-average ave.cons.mod > cons_summary.txt
    #ls *.noncons.mod > noncons.txt
    #phyloBoot --read-mods '*noncons.txt' --output-average ave.noncons.mod > noncons_summary.txt
    #PROBLEM: Can't do this with different numbers of species in each mod
    #Again, just use Contig169 model
    #get rid of SS files with no data
    # for every ferrAcid1.ContigN.ss with zero lines, append an
    # "rm <file>" command to the rmjobs script, then run that script
    foreach f(`cat contignames.txt`)
	set b=$f:r
	wc -l $b.ss | gawk '{if($1==0){print "rm "$2}}' >> rmjobs
    end
    chmod +x rmjobs
    # invoked by bare name, so this relies on the current directory being
    # on the command search path
    rmjobs

    # Score each contig that has a .cons.mod file ($f:r:r strips two
    # extensions, giving ContigN). The Contig169 conserved/non-conserved
    # models are used for every contig. Outputs per contig: a
    # *_ferrAcid1-elements.bed of elements, scores in *_cons.dat, and the
    # wigEncode'd *_phastCons.wig / .wib pair.
    foreach f(*.cons.mod)
	set b=$f:r:r
	echo $b
	phastCons $b.ss Contig169.cons.mod,Contig169.noncons.mod \
	--target-coverage 0.7 --expected-lengths 25 \
	--viterbi ${b}_ferrAcid1-elements.bed --score \
	--require-informative 0 --seqname $b > ${b}_cons.dat
	wigEncode ${b}_cons.dat ${b}_phastCons.wig ${b}_phastCons.wib
    end

    #combine phastCons elements into 1 bed file
    cat Contig*.bed > phastCons.bed

    #move data
    # wiggle files go to wib/, with the .wib files linked into /gbdb
    mkdir wib
    mv Contig*_phastCons.wib wib/.
    mv Contig*_phastCons.wig wib/.
    ln -s /cluster/data/ferrAcid1/bed/conservation/wib/*.wib /gbdb/ferrAcid1/wib
    mkdir /gbdb/ferrAcid1/pwMaf
    mkdir -p otherSpp/therAcid1 otherSpp/picrTorr1 otherSpp/therVolc1
    # Pairwise mafs move to otherSpp/<species>/ContigN.maf.
    # The loops below iterate contigs.txt (names WITHOUT the "ferrAcid1."
    # prefix), so $b is ContigN and ferrAcid1.${b}-<species>.maf matches the
    # files written by the axtToMaf step. Iterating contignames.txt would
    # make $b ferrAcid1.ContigN and the mv sources
    # (ferrAcid1.ferrAcid1.ContigN-...) would not exist.
    foreach f(`cat contigs.txt`)
	set b=$f:r
	echo $b
	mv ferrAcid1.${b}-picrTorr1.maf otherSpp/picrTorr1/$b.maf
	mv ferrAcid1.${b}-therAcid1.maf otherSpp/therAcid1/$b.maf
	mv ferrAcid1.${b}-therVolc1.maf otherSpp/therVolc1/$b.maf
    end
    ln -s /cluster/data/ferrAcid1/bed/conservation/otherSpp/picrTorr1 /gbdb/ferrAcid1/pwMaf/picrTorr1_pwMaf
    ln -s /cluster/data/ferrAcid1/bed/conservation/otherSpp/therVolc1 /gbdb/ferrAcid1/pwMaf/therVolc1_pwMaf
    ln -s /cluster/data/ferrAcid1/bed/conservation/otherSpp/therAcid1 /gbdb/ferrAcid1/pwMaf/therAcid1_pwMaf
    # 4-way mafs move to multiz/ContigN.maf; the directory is linked into /gbdb
    mkdir multiz
    foreach f(`cat contigs.txt`)
	set b=$f:r
	echo $b
	mv ferrAcid1.${b}-therAcid1-picrTorr1-therVolc1.maf multiz/$b.maf
    end
    ln -s /cluster/data/ferrAcid1/bed/conservation/multiz /gbdb/ferrAcid1/multizFaTaPtTv
    #get rid of wig files with no data
    # start a fresh rmjobs script; the wig files are named
    # ContigN_phastCons.wig, so this loop also needs the unprefixed names
    rm rmjobs
    foreach f(`cat contigs.txt`)
	set b=$f:r
	wc -l wib/${b}_phastCons.wig | gawk '{if($1==0){print "rm "$2}}' >> rmjobs
    end
    chmod +x rmjobs
    rmjobs

    #load
    # phastCons wiggle, the 4-way multiz maf, the three pairwise maf tables,
    # and the combined phastCons elements
    hgLoadWiggle ferrAcid1 phastCons /cluster/data/ferrAcid1/bed/conservation/wib/Contig*_phastCons.wig
    hgLoadMaf -warn ferrAcid1 multizFaTaPtTv
    hgLoadMaf -warn ferrAcid1 picrTorr1_pwMaf -pathPrefix=/gbdb/ferrAcid1/pwMaf/picrTorr1_pwMaf
    hgLoadMaf -warn ferrAcid1 therVolc1_pwMaf -pathPrefix=/gbdb/ferrAcid1/pwMaf/therVolc1_pwMaf
    hgLoadMaf -warn ferrAcid1 therAcid1_pwMaf -pathPrefix=/gbdb/ferrAcid1/pwMaf/therAcid1_pwMaf
    hgLoadBed ferrAcid1 phastConsElements phastCons.bed 

    #trackDb
    # Register the conservation track: the trackDb.ra stanza below (kept
    # here as comments) and its details page are added to CVS.
    cd ~/kent/src/hg/makeDb/trackDb/archae/ferrAcid1
    #trackDb.ra entry
    # track multizFaTaPtTv
    # shortLabel Conservation
    # longLabel Methanogen multiz alignments
    # group compGeno
    # priority 10.0
    # visibility pack
    # type wigMaf 0.0 1.0
    # maxHeightPixels 100:40:11
    # wiggle phastCons
    # yLineOnOff Off
    # autoScale Off
    # pairwise pwMaf
    # speciesOrder therAcid1 picrTorr1 therVolc1
    # NOTE(review): the longLabel says "Methanogen" -- confirm that is the
    # intended label for this genome.
    cvs add trackDb.ra
    cvs commit -m "New multiz track" trackDb.ra
    #html page
    cvs add multizFaTaPtTv.html
    cvs commit -m "Details page for multiz track" multizFaTaPtTv.html


# GENBANK PROTEIN-CODING GENES ()
#
# Not marked DONE. Extracts CDS records from the GenBank file with
# gbArchaeGenome, driven by three parameter files, and loads them as
# gbProtCode (BED), gbProtCodePep (peptides) and gbProtCodeXra (product
# and note text).

    ssh hgwdev
    mkdir /cluster/data/ferrAcid1/genbank
    cd /cluster/data/ferrAcid1/genbank
    # NOTE(review): the source here is a bare directory and cp is called
    # without -r or a file pattern -- confirm which file(s) to copy.
    cp /projects/lowelab/db/Bacteria/Ferroplasma_acidarmanus/ .
    
    # NOTE(review): NC_003552.gbk looks carried over from another genome's
    # build doc -- confirm the GenBank file name for this genome.
    mv NC_003552.gbk ferrAcid1.gbk
    # Create 3 files to assist parsing of the genbank
    # (each file lists one entry per line)
    # 1. for a bed file
    echo 'chr
start
end
gene
1000
strand' > ferrAcid1-params-bed.txt
    # 2. for the peptide parts
    echo 'gene
translation' > ferrAcid1-params-pep.txt
    # 3. for the other gene information
    echo 'gene
product
note' > ferrAcid1-params-xra.txt
    # Now extract the genes and information:
    gbArchaeGenome ferrAcid1.gbk ferrAcid1-params-bed.txt ferrAcid1-genbank-cds.bed
    gbArchaeGenome ferrAcid1.gbk ferrAcid1-params-pep.txt ferrAcid1-genbank-cds.pep
    gbArchaeGenome ferrAcid1.gbk ferrAcid1-params-xra.txt ferrAcid1-genbank-cds.xra
    hgLoadBed ferrAcid1 gbProtCode ferrAcid1-genbank-cds.bed
    # create the stock pepPred / minGeneInfo tables, rename them to the
    # gbProtCode* names, then bulk-load the extracted files into them
    hgsql ferrAcid1 < ~/kent/src/hg/lib/pepPred.sql
    hgsql ferrAcid1 < ~/kent/src/hg/lib/minGeneInfo.sql
    echo rename table pepPred to gbProtCodePep | hgsql ferrAcid1
    echo rename table minGeneInfo to gbProtCodeXra | hgsql ferrAcid1
    echo load data local infile \'ferrAcid1-genbank-cds.pep\' into table gbProtCodePep | hgsql ferrAcid1
    echo load data local infile \'ferrAcid1-genbank-cds.xra\' into table gbProtCodeXra | hgsql ferrAcid1

#genbank to genePred
#
# Builds ferrAcid1.gp from the CDS bed plus the .xra annotation file.
# Step 1: pad each 6-column bed row with six more columns (start, end, 0, 1,
#         end-start, 0) and convert it with bedToGenePred.
# Step 2: keep the first 10 genePred columns and append substr($1,3,4),
#         name2, "cmpl", "cmpl", 0. name2 is never assigned in the awk
#         program, so it prints as an empty field.
# Step 3: join with the .xra file, taking field 2.3 from it.
# NOTE(review): the join -t separator appears here as a run of spaces; it
# may originally have been a single tab character -- confirm before reuse.

csh
tawk '{print $1,$2,$3,$4,$5,$6,$2,$3,0,1,$3-$2,0}' ferrAcid1-genbank-cds.bed | bedToGenePred stdin tmp.gp
tawk '{print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,substr($1,3,4),name2,"cmpl","cmpl",0}' tmp.gp  > tmp2.gp
join -t "     " -o 1.1,1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 2.3 1.13 1.14 1.15  tmp2.gp ferrAcid1-genbank-cds.xra > ferrAcid1.gp

# GENBANK rRNA GENES ()
#
# Not marked DONE. Extracts rRNA records (-kind=rRNA) from the GenBank
# file and loads them as gbRRNA (BED) and gbRRNAXra (annotation text).
    # host name corrected to hgwdev, matching every other section of this doc
    ssh hgwdev
    cd /cluster/data/ferrAcid1/genbank
    # reuses the bed parameter file written in the protein-coding section
    gbArchaeGenome -kind=rRNA ferrAcid1.gbk ferrAcid1-params-bed.txt ferrAcid1-rrnas.bed
    # NOTE(review): the other parameter files list one entry per line, while
    # this one puts three on a single line -- confirm gbArchaeGenome accepts it.
    echo 'gene product NA' > ferrAcid1-params-rrna-xra.txt
    gbArchaeGenome -kind=rRNA ferrAcid1.gbk ferrAcid1-params-rrna-xra.txt ferrAcid1-rrnas-xra.txt
    hgLoadBed ferrAcid1 gbRRNA ferrAcid1-rrnas.bed
    # create a stock minGeneInfo table, rename it, and bulk-load the text file
    hgsql ferrAcid1 < ~/kent/src/hg/lib/minGeneInfo.sql
    echo rename table minGeneInfo to gbRRNAXra | hgsql ferrAcid1
    echo load data local infile \'ferrAcid1-rrnas-xra.txt\' into table gbRRNAXra | hgsql ferrAcid1

# COG STUFF
    # Manual step: copy the COG list shown at
    # http://www.ncbi.nlm.nih.gov/cgi-bin/COG/palox into emacs and save it
    # as cogpage.txt.
    # Emit field 6 and field 5 of every line separated by a tab, drop the
    # first "[" and the first "]" of each output line, and store the result
    # in cogs.txt. The pasted page is deleted afterwards.
    awk '{print $6 "\t" $5}' < cogpage.txt \
        | sed 's/\[//; s/\]//' > cogs.txt
    rm cogpage.txt
    # cogs.txt now holds the basic list of cogs with the letter code for each.
    

# TODD LOWE tRNA GENES ()
#
# Not marked DONE. Loads a hand-made tRNA BED file into the loweTrnaGene
# table, using the schema in loweTrnaGene.sql.

    # This one is a bed 6+ file created by hand of 46 tRNAs and 1 pseudo tRNA by Todd
    # Lowe.  See ~/kent/src/hg/lib/loweTrnaGene.as for a description of the fields.
    # **Showing the tRNAScanSE instructions would be nice in the future.  
    ssh hgwdev
    mkdir /cluster/data/ferrAcid1/bed/loweTrnaGene
    cd /cluster/data/ferrAcid1/bed/loweTrnaGene
    # ferrAcid1-lowe-trnas.bed must already be in this directory.
    # $HOME is spelled out because "~" is not expanded in the middle of a
    # word such as -sqlTable=~/..., so the tool would get a literal "~" path.
    hgLoadBed -tab ferrAcid1 loweTrnaGene ferrAcid1-lowe-trnas.bed -sqlTable=$HOME/kent/src/hg/lib/loweTrnaGene.sql

# TODD LOWE snoRNA GENES ()
#
# Not marked DONE. Loads a hand-made snoRNA BED file into the loweSnoGene
# table.
    # This is a bed 6 file created by hand.
    ssh hgwdev
    mkdir /cluster/data/ferrAcid1/bed/loweSnoGene
    cd /cluster/data/ferrAcid1/bed/loweSnoGene
    # ferrAcid1-snos.bed must already be in this directory; no -sqlTable is
    # given here, unlike the tRNA load
    hgLoadBed -tab ferrAcid1 loweSnoGene ferrAcid1-snos.bed


