#how to get a new dump of SwissProt variants
#computeSpVars generates names and coordinates (all but gene links)

#make list of human (taxon 9606) variants from Swiss-Prot -> spVars.txt
#(make sure featureClass 23 is variant before running)
#output columns: acc, start, end-start, featureType.val, featureId.val
hgsql -N uniProt > spVars.txt <<end
select
    feature.acc,
    start,
    end-start,
    featureType.val,
    featureId.val
from feature
inner join featureType
    on featureType = featureType.id
inner join accToTaxon
    on accToTaxon.acc = feature.acc
inner join featureId
    on feature.featureId = featureId.id
where featureClass = 23
    and taxon = 9606;
end
#also need a list mapping 3-letter amino acid codes to 1-letter codes (aminoInfoDump from PSU)
#the known gene protein map (kgProtMap) holds psl data (from blastp),
# with qName being the spId; dump the rows whose qName is a uniProt feature acc
hgsql -N hg17 > kgProtMapDump.txt <<end
select kgProtMap.*
from kgProtMap
inner join uniProt.feature
    on kgProtMap.qName = uniProt.feature.acc;
end
#the table join produces duplicate rows; uniqueRows (perl script) throws out
#the extras before use
uniqueRows < kgProtMapDump.txt > kgProtMapUniq.txt

#before running, check the variables for the input and output file names
#(presumably set inside computeSpVars -- confirm)
computeSpVars > errors.txt
#errors.txt will list the variants that couldn't be mapped
#results from the July 18, 2006 run:
#37 gaps, 564 proteins (2228 variants) not in kgProtMap (test one did align)
#found 22389

#to get the remaining variants, use the protBlat table?
#dump the protBlat rows whose qname is a uniProt feature acc
hgsql -N hg17 > protBlatDump.txt <<end
select protBlat.*
from protBlat
inner join uniProt.feature
    on protBlat.qname = uniProt.feature.acc;
end
#drop the duplicate rows from the join, then remove the raw dump
uniqueRows < protBlatDump.txt > protBlatUniq.txt
rm protBlatDump.txt
#rerun computeSpVars
#found 24339 after using protBlat
#remaining errors are gaps, off the end of the sequence, or at edges of blat exons
#too many errors in the new variants, so use of protBlat was disabled

##########################################################################
#attributes: June 26, 2006
#run the saved queries against hg17 and proteome, saving each result listing
hgsql hg17 < listSPconnections.sql > listSPconnections.txt
hgsql proteome < listSpXref2.sql > listSpXref2.txt
#build the OMIM links (presumably from the two listings above -- confirm in script)
convertOmimTitle > gvLinkSPomim.txt

#same pattern for the gene values listing and the UniProt links
hgsql hg17 < listGeneVals.sql > listGeneVals.txt
convertDisPoly > gvLinkSPuni.txt

#combine both link files and copy the result into the gv data directory
cat gvLinkSPuni.txt gvLinkSPomim.txt > gvLinkSp.txt
cp gvLinkSp.txt ../../../gv/gvData/

##########################################################################
# rewrote the script to compute the attributes together with the rest, so
# no attributes are produced for variants without coordinates
# this makes convertDisPoly, convertOmimTitle and the *.sql files obsolete
# Oct 2, 2006
#get disease association (disFile) from
#http://www.expasy.org/cgi-bin/lists?humsavar.txt
#get psl file: kgProtMap rows (hg18) whose qName is a uniProt feature acc
hgsql -N hg18 > kgProtMapDump.txt <<end
select kgProtMap.*
from kgProtMap
inner join uniProt.feature
    on kgProtMap.qName = uniProt.feature.acc;
end
#the table join produces duplicate rows; uniqueRows (perl script) throws out
#the extras before use
uniqueRows < kgProtMapDump.txt > kgProtMapUniq.txt
#get updated Swiss-Prot data (complete uniprot_sprot flat file, gzipped)
wget ftp://us.expasy.org/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.dat.gz
#use scripts to prep the data for loading, then
#use createJoinFromFiles to get the input file for computeSpVars
#set the variables for the input files etc., then run it;
#stdout is saved to errors.out
computeSpVars > errors.out
