# for emacs: -*- mode: sh; -*-
#########################################################################
# hivmn1 DATABASE BUILD (STARTED, 10/16/07, DONE 10/19/07, Fan)

    ssh hiv1
    mkdir -p /cluster/store12/medical/hiv/hivmn1
    cd /cluster/store12/medical/hiv/hivmn1

# Get HIV MN base sequence (1,653 bases) from GSID

# This sequence is re-created using the data sheets (page 7 131 - 7 133) 
# from VaxGene document, provided by Phil Berman.
# The data sheets are manually read and entered into raw txt file first, then
# manually checked and corrected.  The base genome covers the DNA sequence
# starting from the bases that has the initial AA translation and ending
# at the codon of TAA.  The final result is stored in file, hivmn1.fa.

    faToTab hivmn1.fa stdout|toUpper stdin hivmn1.tab

# translate to nib
    ln -s /cluster/store12/medical/hiv/hivmn1 ~/hivmn1
    cd ~/hivmn1
    mkdir nib
    ln -s hivmn1.fa chr1.fa
    faToNib chr1.fa nib/chr1.nib

# CREATING DATABASE 

# Create the hivmn1 database.
    
    echo 'create database hivmn1' | hgsql hiv1

# CREATING GRP TABLE FOR TRACK GROUPING (DONE 10/16/07)
    
    echo "create table grp (PRIMARY KEY(NAME)) select * from hiv1.grp" \
      | hgsql hivmn1

# STORING O+O SEQUENCE AND ASSEMBLY INFORMATION  

    # Make symbolic links from /gbdb/hivmn1/nib to the real nibs.

    mkdir -p /gbdb/hivmn1/nib
    ln -s /cluster/store12/medical/hiv/hivmn1/nib/chr1.nib /gbdb/hivmn1/nib

    # Load /gbdb/hivmn1/nib paths into database and save size info.
    hgsql hivmn1  < ~/src/hg/lib/chromInfo.sql

    cd ~/hivmn1
    hgNibSeq -preMadeNib hivmn1 /gbdb/hivmn1/nib chr1.fa
    echo "select chrom,size from chromInfo" | hgsql -N hivmn1 > chrom.sizes

# MAKE HGCENTRALHIV1 ENTRY AND TRACKDB TABLE FOR HIVMN 
    echo 'insert into defaultDb values("HIV MN (GP120)", "hivmn1");' \
      | hgsql -h localhost hgcentralhiv1

    echo 'insert into dbDb values("hivmn1", "Oct. 2007", \
          "/gbdb/hivmn1/nib", "HIV MN (GP120)", "chr1", 1, 2030, \
    "HIV MN (GP120)","Human immunodeficiency     virus 1", \
    "/gbdb/hivmn1/html/description.html", 0, 0, " sequence as of Oct., 2007");' \
      | hgsql hgcentralhiv1 -h localhost

    echo 'insert into genomeClade values("HIV MN (GP120)", "other", 110);'\
      | hgsql hgcentralhiv1 -h localhost

    # Edit that makefile to add hivmn1 in all the right places

    cd ~src/hg/makeDb/trackDb
    vi makefile

    make update
    make alpha

    cvs update
    cvs commit makefile

# MAKE HGCENTRALHIV1 BLATSERVERS ENTRY FOR HIVMN
    ssh hiv1
    echo 'insert into blatServers values("hivmn1", "hiv1", "17786", "1", "0"); \
          insert into blatServers values("hivmn1", "hiv1", "17787", "0", "0");' \
      | hgsql hgcentralhiv1 -h localhost

# CREATE TRACKDB TABLE

    hgsql hivmn1 < ~/src/hg/lib/trackDb.sql
    hgsql hivmn1 -e "insert into trackDb select * from hiv1.trackDb"

# copy over tables from hiv1

    mysqldump -d hiv1 dnaSeq -u medcat -p$HGPSWD|hgsql hivmn1
    hgsql hivmn1 -e "insert into dnaSeq select * from hiv1.dnaSeq"

    mysqldump -d hiv1 aaSeq -u medcat -p$HGPSWD|hgsql hivmn1
    hgsql hivmn1 -e "insert into aaSeq select * from hiv1.aaSeq"
    
    mysqldump -d hiv1 gsidSubjInfo -u medcat -p$HGPSWD|hgsql hivmn1
    hgsql hivmn1 -e "insert into gsidSubjInfo select * from hiv1.gsidSubjInfo"

    mysqldump -d hiv1 gsidClinicRec -u medcat -p$HGPSWD|hgsql hivmn1
    hgsql hivmn1 -e "insert into gsidClinicRec select * from hiv1.gsidClinicRec"
    
    mysqldump -d hiv1 gsIdXref -u medcat -p$HGPSWD|hgsql hivmn1
    hgsql hivmn1 -e "insert into gsIdXref select * from hiv1.gsIdXref"

    mysqldump -d hiv1 vax004Msa -u medcat -p$HGPSWD|hgsql hivmn1
    hgsql hivmn1 -e "insert into vax004Msa select * from hiv1.vax004Msa"

# CREATE VAX004 TRACK 

# get vax004 sequences
    hgsql hivmn1 -N -e 'select * from dnaSeq where id like "%U%"' >vax004.tab

# create .fa file
    tabToFa vax004

    mkdir -p /gbdb/hivmn1/vax004
    cp -p vax004.fa /gbdb/hivmn1/vax004/vax004.fa

    hgLoadSeq -replace hivmn1 /gbdb/hivmn1/vax004/vax004.fa

# BLAT
    gfClient -minScore=200 -minIdentity=80 -nohead hiv1.cse.ucsc.edu 17785  /gbdb/hivmn1/nib \
    -out=psl -t=dna -q=dna vax004.fa vax004.psl

# count the result
    wc *.psl
    cut -f 10 vax004.psl |wc
    cut -f 10 vax004.psl |sort -u |wc

# load the psl result into vax004 table
    hgLoadPsl hivmn1 vax004.psl

# hgLoadPsl has some file permission problem.  Finish this by manually load the psl.tab file.
    Hgsql hivmn -e 'load data local infile "psl.tab" into table vax004'

# CREATE HIVGENE TRACK

    mkdir -p /cluster/store12/medical/hiv/hivmn1/hivGene
    cd /cluster/store12/medical/hiv/hivmn1/hivGene

# clean up files from previous build
#    rm *.psl *.bed *.tab *.fa *.tmp

# Get corresponding DNA gemomic sequences from hiv1 base genome.
    hgsql hiv1 -N -e \
    'select h.name, substring(b.seq, h.chromStart, h.chromEnd-h.chromStart) from baseSeq b, hivGene h where b.id="hiv1"' >hSeq.tmp

# keep only the genes/regions within the gp120 region
    fgrep V hSeq.tmp >hSeq.tab
    fgrep vpu hSeq.tmp >>hSeq.tab

    fgrep gp120 hSeq.tmp >>hSeq.tab
    fgrep RRE hSeq.tmp >>hSeq.tab

# create .fa file from .tab file
    tabToFa hSeq

# BLAT the sequences agains hivmn1 base genome
    gfClient -minScore=10 -minIdentity=35 -nohead hiv1.cse.ucsc.edu 17787 \
    /gbdb/hivmn1/nib -out=psl -t=dna -q=dna hSeq.fa jtemp.psl

# Keep only the best ones
    pslReps -singleHit -minAli=0.35 -nohead jtemp.psl hSeq.psl jtemp.psr

# load the BLAT result into a temp psl track
    hgLoadPsl hivmn1 hSeq.psl

# NEED TO MANUALLY LOAD THE psl.tab, BECAUSE hgLoadPsl HAS PERMISSION PROBLEM ON hiv1.
    hgsql hivmn1 -e 'delete from hSeq'
    hgsql hivmn1 -e 'load data local infile "psl.tab" into table hSeq'

# create .bed file for hivGene
    hgsql hivmn1 -N -e 'select "chr1", tStart, tEnd,  qName from hSeq ' >hivGene.bed

# load the bed file into hivGene table, needs to load manually bed.tab manually due 
# to permission problem.
    hgLoadBed hivmn1 hivGene hivGene.bed

# remove temp files
    rm jtemp.* *.tmp psl.tab

# CREATE INTERPRO TRACK

   cd /cluster/store12/medical/hiv/hivmn1
   mkdir interPro
   cd interPro

# get all HIV-1 domain sequences
    getInterProFa interProXrefHiv1 interProHiv1.fa

# BLAT against base genome
    gfClient localhost 17786 /gbdb/hivmn1/nib -out=psl -t=dnax -q=prot  interProHiv1.fa interProHiv1.psl

# load it into a temp table
    hgLoadPsl hivmn1 table=testPsl interProHiv1.psl
# finish above manually.

# create bed file from this temp table
    hgsql hivmn1 -N -e \
    'select "chr1", tStart, tEnd, qName from testPsl where (tEnd-tStart)/3/qSize>0.42' \
    > interProHiv1.bed

# load the bed file into the track table
    hgLoadBed hivmn1 interPro interProHiv1.bed

# drop the temp table.
    hgsql hivmn1 -e 'drop table testPsl'

# CREATE CONSERVATION TRACKS

    mkdir -p /cluster/store12/medical/hiv/hivmn1/conservation
    cd /cluster/store12/medical/hiv/hivmn1/conservation

# create the .wig file and .fa file of the consensus sequence.
    gsidMsa hivmn1 vax004Msa MN 166 vax004Cons.wig vax004Consensus.fa

# encode and load the wig file
    wigEncode vax004Cons.wig stdout vax004Cons.wib \
    | hgLoadWiggle hivmn1 vax004Cons stdin

# copy .wib file to /gbdb
    mkdir -p /gbdb/hivmn1/wib
    cp vax004Cons.wib /gbdb/hivmn1/wib

# do the same for protein conservation track

    mkdir aa
    cd aa

# create .wig file
    gsidAaMsa2 hivmn1 vax004Msa MN 166 vax004AaCons.wig vax004AaConsensus.fa

# encode and load the .wib file   
    wigEncode vax004AaCons.wig stdout vax004AaCons.wib \
    | hgLoadWiggle hivmn1 vax004AaCons stdin

    cp vax004AaCons.wib /gbdb/hivmn1/wib

# CREATE MAF TRACKS

    mkdir -p /cluster/store12/medical/hiv/hivmn1/msa
    cd /cluster/store12/medical/hiv/hivmn1/msa

# create a script file, doall

    hgsql hivmn1 -N -e \
    'select id from dnaSeq where id like "%U%"'\
    |sed -e 's/ss/do1 ss/g' >doall

# create one line script file, do1, with the following line in it:

    hgsql hivmn1 -N -e  "select id, seq from vax004Msa where id='${1}'"

    chmod +x do*

# run the script to get the .tab file with all MSA sequences of VAX004
    doall >mn1.tab
# convert .tab into .fa file
    tabToFa mn1

# grab the base alignment sequence
    echo ">hivmn1" >mn1.aln
    hgsql hivmn1 -N -e 'select seq from vax004Msa where id="MN"'  >> mn1.aln

# prepare an interium file, jjAll.mfa
    cat mn1.aln mn1.fa >jjAll.mfa
    echo = >>jjAll.mfa

# Run xmfaToMafMn1 to create a precursor file for the final .maf

    xmfaToMafMn1 jjAll.mfa j.out  org1=hivmn1
    cat j.out|sed -e 's/\./_/g'|sed -e 's/_chr/\.chr/g' >chr1.tmp

    rm jjAll.mfa j.out

    cat chr1.tmp |sed -e 's/ss_U/U/g' >chr1.maf

# copy .maf to /gbdb.

    mkdir -p  /gbdb/hivmn1/vax004Maf 
    cp chr1.maf /gbdb/hivmn1/vax004Maf -p
    echo before load
    hgLoadMaf hivmn1 vax004Maf

# create another copy for protein MAF.

    mkdir -p  /gbdb/hivmn1/vax004AaMaf 
    cp -p chr1.maf /gbdb/hivmn1/vax004AaMaf
    hgLoadMaf hivmn1 vax004AaMaf
