# This file describes how the sp051015 and proteins051015 databases were
# made using UniProt database files 
# and a few other external databases.

# STARTED ON 10/12/05, sp051015 and proteins051015 DONE ON 10/28/05.

# FIRST PARSE SWISS-PROT RAW DATA FILES AND BUILD sp051015 DB.

# Make subdirectories under /cluster/store11/swissprot

	mkdir /cluster/store11/swissprot/051015
	mkdir /cluster/store11/swissprot/051015/build
	mkdir /cluster/store11/swissprot/051015/tabFiles
# Symlink the release directory into /cluster/data/swissprot; the steps
# that follow work through this /cluster/data path.
	ln -s /cluster/store11/swissprot/051015 /cluster/data/swissprot/051015
	
# mkSwissProtDB.sh was updated so that it takes the date from a command-line
# argument instead of using today's date, and references to
# uniprot_trembl_new.dat were removed, since that file is no longer available.

# Run mkSwissProtDB.sh on kkstore02 to parse Swiss-Prot raw input files.

	ssh kkstore02
	cd /cluster/data/swissprot/051015
	~/src/hg/protein/mkSwissProtDB.sh 051015

# Go to hgwdev and run mkSwissProtDB.sh again, from the same directory and with
# the same date argument, to create and load the sp051015 database.

      ssh hgwdev
	cd /cluster/data/swissprot/051015
	~/src/hg/protein/mkSwissProtDB.sh 051015

# This part processes the variant splice proteins.
# First build variant splice protein tables.
# Get all variant isoform human protein sequences

  ssh hgwdev
  cd /cluster/data/swissprot/051015/build
  
# Uncompress every *varsplic.fasta.gz file present in the build directory.
  gzip -d *varsplic.fasta.gz

# for sp051015, uniprot_trembl_varsplic.fasta does not exist
#  faToTab -type=protein uniprot_trembl_varsplic.fasta splicTrembl.tab

# Convert the Swiss-Prot variant FASTA file into splicSprot.tab.
  faToTab -type=protein uniprot_sprot_varsplic.fasta splicSprot.tab

#  cat splicTrembl.tab splicSprot.tab >varProtein.tab

# With no TrEMBL input, varProtein.tab is simply a copy of splicSprot.tab.
  cat splicSprot.tab >varProtein.tab

# Feed varProtein.sql to the sp051015 database (presumably the table
# definition), then load varProtein.tab into the varProtein table.
  hgsql sp051015 < ~/src/hg/lib/varProtein.sql
  hgsql sp051015 -e 'load data local infile "varProtein.tab" into table varProtein'

# Load the variant protein data into the protein table too.

  hgsql sp051015 -e 'load data local infile "varProtein.tab" into table protein'

# Build varDisplayId.tab, which pairs each variant accession (column 1 of
# varProtein.tab) with itself, and load it into the displayId table.
# The field list and the redirection must be separated: bash parses "1>j1" as
# a redirection of file descriptor 1, which leaves "cut -f" without a field list.
  cut -f 1 varProtein.tab >j1
  paste j1 j1 >varDisplayId.tab
  hgsql sp051015 -e 'load data local infile "varDisplayId.tab" into table displayId'
  
# Build splicProt.tab: the variant accession followed by the same accession
# with every '-' replaced by a tab.  It is only consumed by the commented-out
# splicProt load further down in this file.
  cut -f 1 j1|sed -e 's/-/\t/g' >j2
  paste j1 j2 >splicProt.tab


# Build the varAcc table.
# varAcc.tab holds each distinct variant accession followed by that same
# accession with its first '-' replaced by a tab.

  hgsql sp051015 -N -e 'select acc from varProtein' |sort -u >j1
  sed -e 's/-/\t/' j1 >j2
  paste j1 j2 >varAcc.tab
  rm j1 j2

# "if exists" keeps the drop from failing when varAcc has not been created yet.
  hgsql sp051015 -e 'drop table if exists varAcc'
  hgsql sp051015 < ~/src/hg/lib/varAcc.sql
  hgsql sp051015 -e 'load data local infile "varAcc.tab" into table varAcc'

#  hgsql kgHg17FTemp -e 'drop table splicProt'
#  hgsql kgHg17FTemp <~/src/hg/lib/splicProt.sql
#  hgsql kgHg17FTemp -e 'load data local infile "splicProt.tab" into table splicProt'

#  hgsql kgHg17FTemp -N -e \
#  'select varAcc, varProtein.val from sp051015.varProtein,splicProt,proteins051015.spXref3 where accession=parAcc and varProtein.acc=splicProt.varAcc and division="9606"'| \
#   awk '{print ">" $1;print $2}' >humanVarProt.fa

# Create the tableDescriptions table in sp051015 from spDbTables.as.
# NOTE(review): this path starts with ~/kent/src while the other steps in this
# file use ~/src -- confirm both resolve to the same source tree.

  makeTableDescriptions sp051015 ~/kent/src/hg/protein/spToDb/spDbTables.as

# NEXT BUILD proteins051015 DB

# Make sure the following programs are available
# (listed by name only; no arguments are given here):
  
pfamXref

# mkProteinsDB.sh was updated and then run to create the proteins051015 DB.
# Please note that, due to external DB changes, this script could not be run
# successfully from start to end.  Several manual steps were used.  mkProteinsDB.sh
# has been updated to reflect the actual manual steps used.

   ~/src/hg/protein/mkProteinsDB.sh 051015

# Create indexes on the spXref2 table; each one took about 10 minutes.
# Check first to see whether any of these indexes already exist.
# Each index covers the first 24 characters of its column.

    hgsql proteins051015 -e 'CREATE INDEX accession ON spXref2(accession(24))'
    hgsql proteins051015 -e 'CREATE INDEX displayID ON spXref2(displayID(24))'
    hgsql proteins051015 -e 'CREATE INDEX extAC ON spXref2(extAC(24))'

#    hgsql proteins051015 -e 'CREATE INDEX ii2 ON spXref3(displayID(12))'
#    hgsql proteins051015 -e 'CREATE INDEX ii3 ON spXref3(hugoSymbol(12))'

# Add variant splice protein data into sp051015 tables.
# Take the FASTA header lines, strip the leading '>', replace " (" with a
# tab-delimited "1" column and ") " with a tab, and write the result to
# varProtTemp.tab.

fgrep ">" uniprot_sprot_varsplic.fasta|sed -e 's/>//g' | sed -e 's/ (/\t1\t/g' \
| sed -e 's/) /\t/g' >varProtTemp.tab

# The TrEMBL variant step is disabled.  Both lines of the command must stay
# commented out: a trailing backslash does not continue a comment, so an
# uncommented second line would begin with '|' and be a shell syntax error.
# fgrep ">" uniprot_trembl_varsplic.fasta|sed -e 's/>//g' | sed -e 's/ (/\t2\t/g' \
# | sed -e 's/) /\t/g' >>varProtTemp.tab

# cat uniprot_sprot_varsplic.fasta |sed -e 's/ (/\n/g' |grep -v iso >j.tmp
# cat uniprot_trembl_varsplic.fasta|sed -e 's/ (/\n/g' |grep -v iso >>j.tmp
# faToTab -type=protein j.tmp varProtein.tab
# rm j.tmp
# hgsql sp051015 -e 'load data local infile "varProtein.tab" into table protein'

# Recreate the varProtTemp scratch table and load varProtTemp.tab into it.
# "if exists" keeps the drop from failing when the table is not there yet.
hgsql sp051015 -e 'drop table if exists varProtTemp'
hgsql sp051015 < ~/src/hg/lib/varProtTemp.sql

hgsql sp051015 -e 'load data local infile "varProtTemp.tab" into table varProtTemp'

# Run spToProteinsVar for this release (presumably it writes spXref3Var.tab),
# then load spXref3Var.tab into proteins051015.spXref3.
spToProteinsVar 051015
hgsql proteins051015 -e 'load data local infile "spXref3Var.tab" into table spXref3'

# not needed, done previously
# cut -f 1,2 spXref3Var.tab >j.tmp
# hgsql sp051015 -e 'load data local infile "j.tmp" into table displayId'
# rm j.tmp

# Remove the scratch table now that spXref3 has been loaded.
hgsql sp051015 -e 'drop table varProtTemp'

# Update hgncXref table.
# A deficiency in pbHgnc was found and fixed.  It was rerun and the new
# data was loaded into proteins051015.hgncXref too.
# NOTE(review): the date argument below is 060115, not 051015 as used in the
# rest of this file -- confirm that this is intended.

pbHgnc 060115

# Replace the contents of hgncXref with the regenerated hgncXref.tab.
# The SQL is passed with -e, as elsewhere in this file, so that the steps can
# be pasted into a shell without opening an interactive hgsql session.
hgsql proteins051015 -e 'delete from hgncXref'
hgsql proteins051015 -e 'load data local infile "hgncXref.tab" into table hgncXref'

