#This describes how at least some of the tables in
#hgFixed were created.  This is a database containing
#primarily expression data.  There are two main formats:
#  expRecord.as - This describes the mRNA sources for
#     a series of microarray experiments 
#  expData.as - This describes the measured value 
#     in either absolute or relative ratio terms of
#     each gene/probe/target in a series of microarray
#     experiments.  Each expData is associated with
#     an expRecord, though expDatas sometimes share
#     the same expRecord.

#The Human Affy GNF Expression Atlas 2003 Version:
# Create the main expRecord table and the expData table for
# the absolute measurements as so:
hgGnfMicroarray gnfHumanU95AllExps gnfHumanU95All /projects/compbio/data/microarray/affyGnfHuman/data_public_U95
# Convert these to ratios using the median of medians of non-cancerous
# cell types as the denominator as so:
cd ~/src/hg/makeDb/hgRatioMicroarray
hgRatioMicroarray gnfHumanU95All gnfHumanU95AllRatio -clump=gnfClump.ra
# Take the median value over multiple replicates and put in this table:
cd ../hgMedianMicroarray
hgMedianMicroarray hgFixed gnfHumanU95AllRatio gnfHumanU95AllExps gnfU95Median.ra gnfHumanU95MedianRatio gnfHumanU95MedianExps -minExps=1
# Also make a median version of the absolute measurements
hgMedianMicroarray hgFixed gnfHumanU95All gnfHumanU95AllExps gnfU95Median.ra gnfHumanU95Median gnfHumanU95MedianExps -minExps=1

# The Mouse Affy GNF Expression Atlas:
# Create the expRecord tables for U74 a/b/c and the expData table for
# the absolute measurements:
hgGnfMicroarray gnfMouseU74aAllExps gnfMouseU74aAll /projects/compbio/data/microarray/affyGnfMouse/data/data_public_U74
hgGnfMicroarray gnfMouseU74bAllExps gnfMouseU74bAll /projects/compbio/data/microarray/affyGnfMouse/data/U74B_b.txt
hgGnfMicroarray gnfMouseU74cAllExps gnfMouseU74cAll /projects/compbio/data/microarray/affyGnfMouse/data/U74C_b.txt
# Convert these to ratios using the median of medians of 
# cell types as the denominator as so:

cd ~/src/hg/makeDb/hgRatioMicroarray
hgRatioMicroarray gnfMouseU74aAll gnfMouseU74aAllRatio -clump=gnfMouseU74aClump.ra
hgRatioMicroarray gnfMouseU74bAll gnfMouseU74bAllRatio -clump=gnfMouseU74bClump.ra
hgRatioMicroarray gnfMouseU74cAll gnfMouseU74cAllRatio -clump=gnfMouseU74cClump.ra
# Take the median value over multiple replicates and put in this table:
cd ../hgMedianMicroarray
hgMedianMicroarray hgFixed gnfMouseU74aAllRatio gnfMouseU74aAllExps gnfMouseU74aMedian.ra gnfMouseU74aMedianRatio gnfMouseU74aMedianExps -minExps=1
hgMedianMicroarray hgFixed gnfMouseU74bAllRatio gnfMouseU74bAllExps gnfMouseU74bMedian.ra gnfMouseU74bMedianRatio gnfMouseU74bMedianExps -minExps=1
hgMedianMicroarray hgFixed gnfMouseU74cAllRatio gnfMouseU74cAllExps gnfMouseU74cMedian.ra gnfMouseU74cMedianRatio gnfMouseU74cMedianExps -minExps=1
# Also make a median version of the absolute measurements
hgMedianMicroarray hgFixed gnfMouseU74aAll gnfMouseU74aAllExps gnfMouseU74aMedian.ra gnfMouseU74aMedian gnfMouseU74aMedianExps -minExps=1
hgMedianMicroarray hgFixed gnfMouseU74bAll gnfMouseU74bAllExps gnfMouseU74bMedian.ra gnfMouseU74bMedian gnfMouseU74bMedianExps -minExps=1
hgMedianMicroarray hgFixed gnfMouseU74cAll gnfMouseU74cAllExps gnfMouseU74cMedian.ra gnfMouseU74cMedian gnfMouseU74cMedianExps -minExps=1


#The Human GNF Expression Atlas 2 (2004)
#
# Create the main expRecord table and the expData table for
# the absolute measurements as so:
hgGnfMicroarray gnfHumanAtlas2AllExps gnfHumanAtlas2All /projects/compbio/data/microarray/geneAtlas2/human/U133A+GNF1B_101402.AD.txt -chip=U133A+GNF1B
# Convert these to ratios using the median of medians of non-cancerous
# cell types as the denominator as so:
cd ~/src/hg/makeDb/hgRatioMicroarray
hgRatioMicroarray gnfHumanAtlas2All gnfHumanAtlas2AllRatio -clump=gnfHumanAtlas2Clumps.ra
# Take the median value over multiple replicates and put in this table:
cd ../hgMedianMicroarray
hgMedianMicroarray hgFixed gnfHumanAtlas2AllRatio gnfHumanAtlas2AllExps gnfHumanAtlas2.ra gnfHumanAtlas2MedianRatio gnfHumanAtlas2MedianExps -minExps=1
# Also make a median version of the absolute measurements
hgMedianMicroarray hgFixed gnfHumanAtlas2All gnfHumanAtlas2AllExps gnfHumanAtlas2.ra gnfHumanAtlas2Median gnfHumanAtlas2MedianExps -minExps=1

#The Mouse GNF Expression Atlas 2 (2004)
# Create the main expRecord table and the expData table for
# the absolute measurements as so:
hgGnfMicroarray gnfMouseAtlas2AllExps gnfMouseAtlas2All /projects/compbio/data/microarray/geneAtlas2/mouse/GNF1M_20030403.AD.txt -chip=GNF1M
# Convert these to ratios using the median of medians of non-cancerous
# cell types as the denominator as so:
cd ~/src/hg/makeDb/hgRatioMicroarray
hgRatioMicroarray gnfMouseAtlas2All gnfMouseAtlas2AllRatio -clump=../hgMedianMicroarray/gnfMouseAtlas2.ra
# Take the median value over multiple replicates and put in this table:
cd ../hgMedianMicroarray
hgMedianMicroarray hgFixed gnfMouseAtlas2AllRatio gnfMouseAtlas2AllExps gnfMouseAtlas2.ra gnfMouseAtlas2MedianRatio gnfMouseAtlas2MedianExps -minExps=1
# Also make a median version of the absolute measurements
hgMedianMicroarray hgFixed gnfMouseAtlas2All gnfMouseAtlas2AllExps gnfMouseAtlas2.ra gnfMouseAtlas2Median gnfMouseAtlas2MedianExps -minExps=1

#The Rat GNF Expression Atlas 2 (2004)
# Create the main expRecord table and the expData table for
# the absolute measurements as so:
hgGnfMicroarray gnfRatAtlas2AllExps gnfRatAtlas2All /projects/compbio/data/microarray/geneAtlas2/rat/PivotNoApwithTissues.txt -chip=RG-U34A -ref=http://expression.gnf.org/ratlas
# Convert these to ratios using the median of medians of non-cancerous
# tissues or cell types (in this case, this is all the tissues) as the 
# denominator as so:
cd ~/src/hg/makeDb/hgRatioMicroarray
hgRatioMicroarray gnfRatAtlas2All gnfRatAtlas2AllRatio -clump=gnfRatAtlas2Clumps.ra
# Take the median value over multiple replicates and put in this table.
# Use Clumps.ra file renamed as gnfRatAtlas2.ra as this contains all the 
# tissues since there are no cancer tissues in this expression data set:
cd ../hgMedianMicroarray
hgMedianMicroarray hgFixed gnfRatAtlas2AllRatio gnfRatAtlas2AllExps gnfRatAtlas2.ra gnfRatAtlas2MedianRatio gnfRatAtlas2MedianExps -minExps=1
# Also make a median version of the absolute measurements
hgMedianMicroarray hgFixed gnfRatAtlas2All gnfRatAtlas2AllExps gnfRatAtlas2.ra gnfRatAtlas2Median gnfRatAtlas2MedianExps -minExps=1

# C. elegans life cycle data from the Kim Lab via the Stanford Microarray Database.
cd ~/kent/src/hg/makeDb/hgStanfordMicroarray
hgStanfordMicroarray hgFixed kimWormLifeAllRatio kimWormLifeAllExps /projects/compbio/data/microarray/wormLifeCycle/spots -swap '-trimName=(green)' -suppress=green '-trimTissue=(repeat #?)'
cd ../hgMedianMicroarray
hgMedianMicroarray hgFixed kimWormLifeAllRatio kimWormLifeAllExps kimMed.ra kimWormLifeMedianRatio kimWormLifeMedianExps

# D. melanogaster life cycle data from Arbeitman et al 2002 
# via the Stanford Microarray Database.
cd ~/kent/src/hg/makeDb/hgStanfordMicroarray
# absolute:
hgStanfordMicroarray -geneField="Systematic name" -dataField=CH2I_MEDIAN \
  hgFixed arbFlyLifeAll arbFlyLifeAllExps \
  /projects/compbio/data/microarray/flyLifeCycle/spots
# ratios:
hgStanfordMicroarray -geneField="Systematic name" \
  hgFixed arbFlyLifeAllRatio arbFlyLifeAllExps \
  /projects/compbio/data/microarray/flyLifeCycle/spots
cd ../hgMedianMicroarray
echo "select name,id from arbFlyLifeAllExps" | hgsql -N hgFixed  \
  | sort > arbMed.ra
# edit arbMed.ra to collapse the N=1, N=2 lines.
# median absolute:
hgMedianMicroarray hgFixed arbFlyLifeAll arbFlyLifeAllExps arbMed.ra \
  arbFlyLifeMedian arbFlyLifeMedianExps
# median ratios:
hgMedianMicroarray hgFixed arbFlyLifeAllRatio arbFlyLifeAllExps arbMed.ra \
  arbFlyLifeMedianRatio arbFlyLifeMedianExps
# cvs add and check in arbMed.ra

###########################
# REGENERATING FLY LIFE-CYCLE TABLES. (DONE 5/12/2006 ANDY)
# NOTE(review): the heading says FLY, but the commands below only rename the
# worm (kimWormLife*) tables to *_old, and no regeneration commands are
# recorded here -- confirm which data set was actually rebuilt.
hgsql hgFixed -e "rename table kimWormLifeAllRatio to kimWormLifeAllRatio_old"  
hgsql hgFixed -e "rename table kimWormLifeMedianExps to kimWormLifeMedianExps_old"
hgsql hgFixed -e "rename table kimWormLifeMedianRatio to kimWormLifeMedianRatio_old"

# The scopDes table, which is used by the SuperFamily column in hgNear.
mkdir /cluster/store1/scop
cd /cluster/store1/scop
wget http://scop.mrc-lmb.cam.ac.uk/scop/parse/dir.des.scop.txt_1.63
grep -v '^#' dir.des.scop.txt* > scopDes.txt
hgsql hgFixed < ~/kent/src/hg/lib/scopDes.sql
echo "load data local infile 'scopDes.txt' into table scopDes;" | hgsql hgFixed

# The Yeast Cell Cycle Time Course from Cho RJ et al 1998
cd /cluster/data/sacCer1/download/systematic_results/expression_data
hgGnfMicroarray yeastChoCellCycleExps yeastChoCellCycle  \
	Cho_et_al_full_data.txt -chip=affyYeast \
	-chopName=/ \
	-url=http://yscdp.stanford.edu/yeast_cell_cycle/cellcycle.html \
	-ref=http://www.pnas.org/cgi/content/abstract/95/7/3752 \
	-credit=http://yscdp.stanford.edu/yeast_cell_cycle/cellcycle.html
cd ~/src/hg/makeDb/hgRatioMicroarray
hgRatioMicroarray yeastChoCellCycle yeastChoCellCycleRatio

# Mouse expression data by sex on Affy MOE430A arrays from
# John Rinn (john.rinn@yale.edu) et al.
cd /projects/compbio/data/microarray/rinnEtAl
hgGnfMicroarray mouseRinnSexExps mouseRinnSex rinnEtAlSpots.txt \
    -chip=MOE430A \
    -url=n/a \
    -ref=n/a \
    -credit=n/a
cd ~/kent/src/hg/makeDb/hgRatioMicroarray 
hgRatioMicroarray mouseRinnSex mouseRinnSexRatio
cd ~/kent/src/hg/makeDb/hgMedianMicroarray 
hgMedianMicroarray hgFixed mouseRinnSex mouseRinnSexExps mouseRinnSex.ra mouseRinnSexMedian mouseRinnSexMedianExps
hgMedianMicroarray hgFixed mouseRinnSexRatio mouseRinnSexExps mouseRinnSex.ra mouseRinnSexMedianRatio mouseRinnSexMedianExps

# D. melanogaster full euchromatic expression profile (FEEP) -- 
# Stolc et al. 2004.  
# Loaded up absolute tables directly from files downloaded from 
# http://genome.med.yale.edu/FEEP/FEEP.html --
# see /projects/compbio/data/microarray/flyFEEP/README .
# Extract ratio from absolute:
hgRatioMicroarray flyFeepAll flyFeepAllRatio
cd ~/kent/src/hg/makeDb/hgMedianMicroarray
echo "select description,id from flyFeepAllExps" | hgsql -N hgFixed  \
  | sort > flyFeepMed.ra
# edit flyFeepMed.ra to collapse lines with the same initial character.
# median absolute:
hgMedianMicroarray hgFixed flyFeepAll flyFeepAllExps flyFeepMed.ra \
  flyFeepMedian flyFeepMedianExps
# median ratios:
hgMedianMicroarray hgFixed flyFeepAllRatio flyFeepAllExps flyFeepMed.ra \
  flyFeepMedianRatio flyFeepMedianExps
# cvs add and check in flyFeepMed.ra

# Human data from Shyamsundar R, et al. (2005) Genome Biol 6(3):R22
mkdir -p /projects/compbio/data/microarray/shyamsundarEtAl
cd /projects/compbio/data/microarray/shyamsundarEtAl
wget ftp://smd-ftp.stanford.edu/smd/publications/426/3130/exptsetno_3130.tar.gz
wget ftp://smd-ftp.stanford.edu/smd/publications/426/3130/exptset_3130.meta
tar xfz exptsetno_3130.tar.gz
rm exptsetno_3130.tar.gz
mkdir spots
cat << _EOF_ > cleanXls.awk
{ 
  if (/^!/) 
     {
     line = \$0
     gsub(/\"|,/, "", line)
     print line
     }
   else
     print
}
_EOF_
for file in *.xls; do
   awk -f cleanXls.awk $file > spots/$file
done
cd ~/kent/src/hg/makeDb/hgMedianMicroarray
# The hgFixed.history table doesn't have the errata column, so add it.
# The statement is quoted because the bare parentheses in varchar(255) are
# otherwise parsed by the shell and cause a syntax error.
echo "alter table history add column errata varchar(255)" | hgsql hgFixed
hgStanfordMicroarray -dataField="Normalized Ch2 Intensity (Median)" \
  hgFixed humanNormal humanNormalExps /projects/compbio/data/microarray/shyamsundarEtAl/spots
hgStanfordMicroarray -dataField="Log(base2) of R/G Normalized Ratio (Mean)" \
  hgFixed humanNormalRatio humanNormalExps /projects/compbio/data/microarray/shyamsundarEtAl/spots
echo "select name from humanNormalExps" | hgsql -N hgFixed | awk "{print \"\'\"\$0\"\'\"}" > col1
echo "select id from humanNormalExps" | hgsql -N hgFixed > col2
n=`wc -l < col1`
for i in `seq 1 $n`; do echo "n/a" >> col1.5; done
paste col1 col1.5 col2 | sort | tr '\t' ' ' > humanNormal.ra
rm col1 col1.5 col2
# EDIT humanNormal.ra by hand and combine the like tissues
hgMedianMicroarray -minExps=1 hgFixed humanNormal humanNormalExps humanNormal.ra \
  humanNormalMedian humanNormalMedianExps
hgMedianMicroarray -minExps=1 hgFixed humanNormalRatio humanNormalExps humanNormal.ra \
  humanNormalMedianRatio humanNormalMedianExps
#### HUMAN NORMAL DATA FIXING (10/5/2006 Andy)
# Build one tab-separated matrix (data.txt) from the per-array .xls spot
# files: fields 8 and 63 of the first array, then field 63 of every other
# array pasted on as an extra column, topped by a header row of array names
# looked up in exptset_3130.meta.
ssh hgwdev
cd /projects/compbio/data/microarray/shyamsundarEtAl
# Seed the matrix from array 13729.  Each file is read from line 23 onward.
# "tail -n +23" replaces the obsolete "tail +23" form.
mv spots/13729.xls .
tail -n +23 13729.xls | cut -f8,63 > data.txt
echo 13729 > arrays.txt
# Append field 63 of each remaining array, recording the file order in
# arrays.txt so the column order is known.
for array in spots/*; do 
    echo $array >> arrays.txt
    tail -n +23 $array | cut -f63 > newCol.txt
    paste data.txt newCol.txt > tmp.txt
    mv tmp.txt data.txt 
done
# Drop rows that begin with whitespace.
sed '/^[[:space:]]/d' data.txt > tmp.txt
mv tmp.txt data.txt
# Reduce arrays.txt entries from "spots/NNNN.xls" to the bare array id.
sed 's/spots\///;s/\.xls.*$//' arrays.txt > tmp.txt
mv tmp.txt arrays.txt
# Look up each array's Name in the metadata; names.txt ends up in the same
# order as the data columns.
for id in `cat arrays.txt`; do grep $id -B1 exptset_3130.meta | grep Name | sed 's/.*=//;s/\"//g' >> names.txt; done
paste arrays.txt names.txt | sort -k2,2 > tmp.txt
mv tmp.txt arrays.txt
# I changed my mind 
# names.txt is still needed for the header row below, so it is removed only
# after that step (the original log removed it here, before its last use).

# Header row: an empty first field, then the array names, tab-separated.
# "paste -s" ends the row with a newline; the earlier "tr '\n' '\t'" form
# left a trailing tab and no newline, which glued the header onto the first
# data row.
echo "" | cat - names.txt | paste -s - > oneLine.txt
cat oneLine.txt data.txt > tmp.txt
# The original log had the truncated target "data." here.
mv tmp.txt data.txt
rm names.txt

# (copy/paste this into columnDb.ra)

# Mouse data from Zhang, et. al "The functional landscape of mouse gene expression" J Biol. 
# http://hugheslab.med.utoronto.ca/Zhang/
mkdir -p /cluster/store2/microarray
ln -s /cluster/store2/microarray /cluster/data/microarray
mkdir -p /cluster/data/microarray/zhangEtAl
cd /cluster/data/microarray/zhangEtAl
wget http://hugheslab.med.utoronto.ca/Zhang/expression_39309_normalized.txt
sed 's/\(XM_[0-9]\+\)\.1/\1/' expression_39309_normalized.txt > arrays.txt
hgGenericMicroarray hgFixed mouseLandscape arrays.txt
wget http://hugheslab.med.utoronto.ca/Zhang/mouse_XM_mRNA_NCBI_2.fa
sed 's/^>.*|\(XM.*\)\.1|.*$/>\1/' mouse_XM_mRNA_NCBI_2.fa > xm.fa
ssh kk9
cd /santest/scratch
mkdir andy
cd andy/
cp /cluster/data/microarray/zhangEtAl/xm.fa .
ls -1 /panasas/store/mm6/nib/* | grep -v random > chroms.lst
cat << _EOF_ > gsub
#LOOP
blat -ooc=/scratch/hg/h/mouse11.ooc -fine -q=rna -noHead \$(path1) xm.fa xm.\$(root1).psl
#ENDLOOP
_EOF_
gensub2 chroms.lst single gsub spec
para create spec
para push
para time
#Completed: 22 of 22 jobs
#CPU time in finished jobs:      36298s     604.96m    10.08h    0.42d  0.001 y
#IO & Wait Time:                    91s       1.52m     0.03h    0.00d  0.000 y
#Average job time:                1654s      27.57m     0.46h    0.02d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:            2955s      49.25m     0.82h    0.03d
#Submission to last job:          2957s      49.28m     0.82h    0.03d
cat *.psl > xm.psl
ssh hgwdev
cd /cluster/data/microarray/zhangEtAl
cp /santest/scratch/andy/xm.psl .
hgLoadPsl -table=xmMrna mm6 xm.psl
hgMapToGene -type=psl -cds mm6 xmMrna knownGene knownToXM
echo drop table xmMrna | hgsql mm6 

# REBASE 505 (4-28-2005)  (Done 5/18/2005 Andy)
   ssh hgwdev
   # download files
   curl -A "Mozilla/4.0" http://rebase.neb.com/rebase/link_gcgenz > rebase.gcg
   curl -A "Mozilla/4.0" http://rebase.neb.com/rebase/link_gcgref > rebaseRefs.txt
   # References file: read from line 15 onward, squeeze runs of spaces, strip
   # the leading space and drop empty lines.  c1 gets the first
   # space-separated field with its first "." removed, c2 gets the rest of
   # the line; they are pasted back together tab-separated.
   # "tail -n +15" replaces the obsolete "tail +15" form.
   tail -n +15 rebaseRefs.txt | sed 's/ \+/ /g; s/^ //; /^$/d' | cut -f2- -d' ' > c2
   tail -n +15 rebaseRefs.txt | sed 's/ \+/ /g; s/^ //; /^$/d' | cut -f1 -d' ' | sed 's/\.//' > c1
   paste c1 c2 | sed '/^$/d' > rebaseRefs.txt
   rm c1 c2
   # Load the cutters table.
   hgCutters hgFixed rebase.gcg
   # Load the other table.  Empty it first; the statement must be plain SQL
   # ("echo delete ..." is not valid SQL, so the old rows were never removed).
   hgsql hgFixed -e "delete from rebaseRefs"
   hgsql hgFixed -e "load data local infile 'rebaseRefs.txt' into table rebaseRefs"
   
# REBASE 603 (3-1-2006)  (Done 3-2-2006 Andy)
   ssh hgwdev
   # download files
   curl -A "Mozilla/4.0" http://rebase.neb.com/rebase/link_gcgenz > rebase.gcg
   curl -A "Mozilla/4.0" http://rebase.neb.com/rebase/link_gcgref > rebaseRefs.txt
   # References file: read from line 15 onward, squeeze runs of spaces, strip
   # the leading space and drop empty lines.  c1 gets the first
   # space-separated field with its first "." removed, c2 gets the rest of
   # the line; they are pasted back together tab-separated.
   # "tail -n +15" replaces the obsolete "tail +15" form.
   tail -n +15 rebaseRefs.txt | sed 's/ \+/ /g; s/^ //; /^$/d' | cut -f2- -d' ' > c2
   tail -n +15 rebaseRefs.txt | sed 's/ \+/ /g; s/^ //; /^$/d' | cut -f1 -d' ' | sed 's/\.//' > c1
   paste c1 c2 | sed '/^$/d' > rebaseRefs.txt
   rm c1 c2
   # Load the cutters table.
   hgCutters hgFixed rebase.gcg
   # Load the other table.  Empty it first; the statement must be plain SQL
   # ("echo delete ..." is not valid SQL, so the old rows were never removed).
   hgsql hgFixed -e "delete from rebaseRefs"
   hgsql hgFixed -e "load data local infile 'rebaseRefs.txt' into table rebaseRefs"

# REBASE 902 (2009) (DONE 2009-02-09, Andy)
   ssh hgwdev
   mkdir /hive/data/outside/rebase
   cd /hive/data/outside/rebase
   # NOTE(review): no download step is recorded here, yet rebase.gcg and
   # rebaseRefs.txt are read below from this newly created directory.
   # Presumably they were fetched as in the earlier REBASE sections -- confirm.
   # Read the references file from line 15 onward, drop empty lines, and turn
   # the leading "<spaces><digits>.<spaces>" prefix into "<digits><TAB>".
   tail -n+15 rebaseRefs.txt | sed '/^$/d; s/^\s\+\([[:digit:]]\+\)\.\s\+\</\1\t/' > tmp
   mv tmp rebaseRefs.txt
   # Load the cutters table, then empty and reload rebaseRefs.
   hgCutters hgFixed rebase.gcg
   hgsql hgFixed -e "delete from rebaseRefs"
   hgsql hgFixed -e "load data local infile 'rebaseRefs.txt' into table rebaseRefs"   
   # Archive this release's input files.
   mkdir rebase902
   mv rebase.gcg rebaseRefs.txt rebase902/

# GLADSTONE hESC Novartis microarray data.
   # 1. Download http://www.genmapp.org/temp/humansimpleESC.zip
   # 2. Manually convert using MS access on Bob's laptop to a tab-delimited text file.
   # 3. Add column names to that file manually.
   ssh hgwdev
   mkdir /projects/compbio/data/microarray/gladstone
   cp hESC.txt /projects/compbio/data/microarray/gladstone
   cd /projects/compbio/data/microarray/gladstone
   # Keep fields 1, 4 and 6, skip the first line, and sort by field 3 then
   # field 1.  "tail -n +2" replaces the obsolete "tail +2" form.
   cut -f1,4,6 hESC.txt | tail -n +2 | sort -k3,3 -k1,1 > data.1
   for tiss in `cut -f3 data.1 | sort | uniq`; do 
      grep $tiss data.1 | cut -f1,2 | sort -k1,1 | cut -f2 > data.${tiss}.1
      echo $tiss | cat - data.${tiss}.1 > data.${tiss}.2
   done
   paste data.*.2 > data.2
   grep Lung data.1 | cut -f1 | sort > names
   echo Probe | cat - names | paste - data.2 > data.3
   cat << _EOF_ > fixGladstone.sed
s/_/ /;
s/Embryonicstemcell/Embryonic Stem Cell/;
s/Smoothmuscle/Smooth Muscle/;
s/Salivarygland/Salivary Gland/;
s/Lymphnode/Lymph Node/;
s/Bonemarrow/Bone Marrow/;
s/Spinalcord/Spinal Cord/;
s/Wholebrain/Whole Brain/;
s/blood/Blood/;
_EOF_
   head -n1 data.3 | sed -f fixGladstone.sed > header
   # Put the cleaned-up header line back on top of the remaining lines of
   # data.3.  "tail -n +2" replaces the obsolete "tail +2" form.
   tail -n +2 data.3 | cat header - > data.4
   mv data.4 generic.hESC.txt
   rm data.* names header
   hgGenericMicroarray hgFixed gladHumES generic.hESC.txt
   hgRatioMicroarray gladHumES gladHumESRatio

# GLADSTONE 
   ssh hgwdev
   cd /projects/compbio/data/microarray/gladstone
   awk '{if ($3 == $4) print}' hESC.txt > bestQ.hESC.txt
   cat << _EOF_ | hgsql hgFixed
CREATE TABLE gladHumESOtherData (
    name varchar(255) not null,    # Name of item
    tissueQ varchar(255) not null,  # Name of Q-associated tissue
    qVal float not null, # Q value
    hVal float not null, # H value
      #Indices
    INDEX(name(8)),
    INDEX(tissueQ(10))
);
_EOF_
   # Take fields 1, 2, 5 and 6 without the first line, sorted by field 1 and
   # then numerically by field 3.  The first awk reverses the field order so
   # that "uniq -f3" compares only the original first field, keeping the
   # first line of each group; the second awk restores the original order.
   # "tail -n +2" replaces the obsolete "tail +2" form.
   cut -f1,2,5,6 hESC.txt | tail -n +2 | sort -k1,1 -k3,3n \
     | awk '{printf("%s\t%s\t%s\t%s\n", $4, $3, $2, $1)}' \
     | uniq -f3 \
     | awk '{printf("%s\t%s\t%s\t%s\n", $4, $1, $2, $3)}' \
     > gladOther.txt
   # Fix up the tissue column
   cut -f2 gladOther.txt > tmp.tiss.1
   sed -f fixGladstone.sed tmp.tiss.1 > tmp.tiss.2
   cut -f1 gladOther.txt > tmp.names
   cut -f3- gladOther.txt | paste tmp.names tmp.tiss.2 - \
     > tmp.glad
   mv tmp.glad gladOther.txt
   rm tmp.*
   echo "load data local infile 'gladOther.txt' into table gladHumESOtherData" | hgsql hgFixed

# PRINCETON STEM CELL ARRAYS
   ssh hgwdev
   mkdir /projects/compbio/data/microarray/princetonESC
   cd /projects/compbio/data/microarray/princetonESC
   for num in i ii iii iv v vi vii; do
      wget http://stemcell.princeton.edu/affy_cluster_${num}.html
      grep "td bgcolor=\"#FFFFAA\" align=center class=ssb" affy_cluster_${num}.html | sed 's/.*<p>\(.*\)<\/td>/\1/' > names
      grep "<td class=fixed align=right>" affy_cluster_${num}.html | sed 's/.*right>\(.*\)&nbsp;<\/td>.*$/\1/' | colify 9 /dev/stdin > data
      paste names data >> tmp.txt
      rm names data affy_cluster_${num}.html
   done
   echo "~Bone Marrow RhoLo~Bone Marrow RhoHi~Bone Marrow Sca-~Bone Marrow Lin+~Fetal Liver Sca+~Fetal Liver Sca-~Fetal Liver Lin+~Neural Stem Cells~Embryonic Stem Cells" | tr '~' '\t' | cat - tmp.txt > princeton.txt
   rm tmp.txt

# QA push cghNci60Exps on 2006-02-07 to rr. Table/data previously missing (Jen)
# QA re-push rosChr22Dat on 2006-02-08 to fix table formatting/timestamps (Jen)

# AFFY ALL EXON HUMAN ARRAYS (INCLUDES TABLES ON HG17 AND HG18) (Done 3/15/2006, Andy)
     # Chuck put them in tab-delimited file in ~sugnet
   ssh hgwdev
   cd /projects/compbio/data/microarray
   mkdir affyHumanExon
   cd affyHumanExon/
   cp ~sugnet/plier-gcbg-sketch.summary.txt .
   sed -e "s/huex_wta_//g" -e "s/\.CEL//g" plier-gcbg-sketch.summary.txt > data.txt
   hgGenericMicroarray hgFixed affyHumanExon data.txt
     # Chuck put probe data into two tables in hg17.
     # Grab the bed first.  Change the original name because a lot got started
     # without keeping Chuck's naming convention in mind.  oh well.
   hgsql hg17 -e "rename table affyHuEx1 to affyHumanExonProbes"
   hgsql hg17 -e "rename table affyHuEx1Annot to affyHumanExonProbeAnnot"
     # Dump the probe bed (fields 2-7, without the column-name line).
     # The table was renamed just above, so select from the new name.
     # The original had "cut -f2-7 | > file", which pipes the data into an
     # empty command and leaves hg17.probes.bed empty.
   hgsql hg17 -e "select * from affyHumanExonProbes" | tail -n +2 | cut -f2-7 > hg17.probes.bed
     # Lift to hg18
   liftOver hg17.probes.bed /gbdb/hg17/liftOver/hg17ToHg18.over.chain hg18.probes.bed hg18.unMapped
     # How many didn't get lifted (out of 1.4 million)? 
   wc -l hg18.unMapped
#    276 hg18.unMapped
     # That's not bad at all. 99.99% of them lifted fine.
     # Load the hg18 probe bed.  Change the name of the hg17 one.
   hgLoadBed hg18 affyHumanExonProbes hg18.probes.bed
     # Deal with that extra annotation table of Chuck's.  I made a new autosql 
     # which almost matches it except for the name/probeSet fields.
     # First copy it out of hg17 and into a file with the new column order.
   hgsql hg17 -e "select probesetId,numIndependentProbes,exonClustId,numNonOverlapProbes,probeCount,transcriptClustId,probesetType,numXHybeProbe,psrId,level,evidence,bounded,cds from affyHumanExonProbeAnnot" \
   | tail +2 > annot.tab
     # Load that into hgFixed and change the name.
   hgLoadSqlTab hgFixed affyAllExonProbe ~/kent/src/hg/lib/affyAllExonProbe.sql annot.tab
   hgsql hgFixed -e "rename table affyAllExonProbe to affyHumanExonProbeAnnot"
     # Make ratio table for the microarray
   hgRatioMicroarray affyHumanExon affyHumanExonRatio
     # Merge probe beds with array data and load those beds.
   bedMergeExpData hgFixed.affyHumanExonRatio hg17.affyHumanExonProbes hg17.bed
   bedMergeExpData hgFixed.affyHumanExonRatio hg18.affyHumanExonProbes hg18.bed
   hgLoadBed hg17 affyHumanExon hg17.bed
   hgLoadBed hg18 affyHumanExon hg18.bed
     # Create human-level trackDb entry and affyHumanExon.html
     # and check into cvs.

###### AFFY HUMAN EXONS (COMPLETE DATA) (DONE 7-21-2006, Andy)
    ssh hgwdev
    cd /projects/compbio/data/microarray/affyHumanExon/
    mkdir moreData
    cd moreData/
    ssh bark
    cd /scratch
    cp forAndy/* /projects/compbio/data/microarray/affyHumanExon/moreData
    exit
    sed -e "s/huex_wta_//g" -e "s/\.CEL//g" exonData.vs.tab > data.txt
    hgGenericMicroarray hgFixed affyHumanExon data.txt
    # Dump the Exps table without the column-name line and replace each
    # line's first "_<one char>," with ",".
    # "tail -n +2" replaces the obsolete "tail +2" form.
    hgsql hgFixed -e "select * from affyHumanExonExps" | sed "/^\+/d" | tail -n +2 | sed "s/_.,/,/" > newExps.tab
    hgsql hgFixed -e "delete from affyHumanExonExps"
    hgsql hgFixed -e "load data local infile 'newExps.tab' into table affyHumanExonExps"
    cd ~/kent/src/hg/makeDb/hgRatioMicroarray/
    # Make file affyHumanExon.ra in the medSpec style.
    hgRatioMicroarray -minAbsVal=0 -clump=affyHumanExon.ra affyHumanExon affyHumanExonRatio    
    bedMergeExpData hgFixed.affyHumanExonRatio hg17.affyHumanExonProbes hg17.bed
    bedMergeExpData hgFixed.affyHumanExonRatio hg18.affyHumanExonProbes hg18.bed
    hgLoadBed hg17 affyHumanExon hg17.bed
    hgLoadBed hg18 affyHumanExon hg18.bed
# Copied affyHumanExon to hg16 (DONE 10-12-2006, Andy)
    cd /cluster/data/hg16/bed/
    mkdir affyHumanExon
    cd affyHumanExon/
    # Dump the expData columns of hg17.affyHumanExon without the column-name
    # line.  "tail -n +2" replaces the obsolete "tail +2" form.
    echo "select name,expCount,expScores from affyHumanExon" | hgsql hg17 | tail -n +2 > expdata.tab
    cp ~/kent/src/hg/lib/expData.sql .
    hgLoadSqlTab hgFixed expData expData.sql expdata.tab
    bedMergeExpData hgFixed.expData hg16.affyHuEx1 hg16.bed
    hgLoadBed hg16 affyHumanExon hg16.bed 
    hgsql -e 'drop table expData' hgFixed

# QA push new cutters and rebaseRefs tables (04-06-2006: ASZ).

### load ncbi taxonomy tables (04-11-2006: Robert).

mkdir /cluster/store5/taxonomy
cd /cluster/store5/taxonomy
# Options go before the operands (was "ln <target> <link> -s").
ln -s /cluster/store5/taxonomy /cluster/data/taxonomy
wget ftp://ftp.taxon.nih.gov/pub/taxonomy/taxdump.tar.gz
tar xvfz taxdump.tar.gz 
# Convert the .dmp files to tab-separated text: the sed steps turn each
# "<TAB>|<TAB>" separator into "~" and strip the remaining "<TAB>|", then awk
# splits on "~" and prints tab-separated fields.
# For names.dmp the output is field 3 (or field 2 when field 3 is shorter
# than 2 characters), then field 1, then field 4.
sed -e 's/\t|\t/~/g' names.dmp |sed -e 's/\t|//g' |awk -F~ 'length($3)<2{OFS="\t";print $2,$1,$4}length($3)>=2{OFS="\t";print $3,$1,$4}' > taxonName.txt 
sed -e 's/\t|\t/~/g' division.dmp |sed -e 's/\t|//g' |awk -F~ '{OFS="\t";$1=$1;print $0}' > taxonDivision.txt 
sed -e 's/\t|\t/~/g' gencode.dmp |sed -e 's/\t|//g' |awk -F~ '{OFS="\t";$1=$1;print $0}' > taxonGencode.txt 
# Written as taxonNode.txt (was ncbiNode.txt) so that it matches the file
# name loaded into taxonNode below.
sed -e 's/\t|\t/~/g' nodes.dmp |sed -e 's/\t|//g' |awk -F~ '{OFS="\t";$1=$1;print $0}' > taxonNode.txt
pushd ~/kent/src/hg/lib
autoSql taxonNode.as taxonNode -dbLink
autoSql taxonXref.as taxonXref -dbLink
autoSql taxonName.as taxonName -dbLink
autoSql taxonGeneticCode.as taxonGeneticCode -dbLink
autoSql taxonDivision.as taxonDivision -dbLink
mv taxon*.h ../inc
make
#edit .sql files to add indexes

hgsql hgFixed < taxonName.sql
hgsql hgFixed < taxonNode.sql
hgsql hgFixed < taxonDivision.sql
hgsql hgFixed < taxonGeneticCode.sql
# taxonXref is loaded at the end of this section, so create it here too
# (the original log never created it).
hgsql hgFixed < taxonXref.sql
popd

# Back in /cluster/store5/taxonomy: load the converted files.
hgsql hgFixed -e "load data local infile 'taxonName.txt' into table taxonName;"
hgsql hgFixed -e "load data local infile 'taxonNode.txt' into table taxonNode" 
hgsql hgFixed -e "load data local infile 'taxonDivision.txt' into table taxonDivision;"
hgsql hgFixed -e "load data local infile 'taxonGencode.txt' into table taxonGeneticCode;"

# Build the cross-reference rows by joining sp060115.taxon, hgFixed.taxonName
# and the organism table on name and taxon id, ordered by toGenus.
echo "select o.name, n.taxon as ncbi_taxon, n.name , toGenus from sp060115.taxon t, hgFixed.taxonName n, organism o where o.name = n.name and n.taxon = t.id order by toGenus;" | hgsql hg17 -N -B > taxonXref.txt
hgsql hgFixed -e "load data local infile 'taxonXref.txt' into table taxonXref;"

#--**************************************************************************
#--  This is the NCBI genetic code table
#--  Initial base data set from Andrzej Elzanowski while at PIR International
#--  Addition of Eubacterial and Alternative Yeast by J.Ostell at NCBI
#--  Base 1-3 of each codon have been added as comments to facilitate
#--    readability at the suggestion of Peter Rice, EMBL
#--  Later additions by Taxonomy Group staff at NCBI
#--
#--  Version 3.9
#--     Code 14 differs from code 9 only by translating UAA to Tyr rather than
#--     STOP.  A recent study (Telford et al, 2000) has found no evidence that
#--     the codon UAA codes for Tyr in the flatworms, but other opinions exist.
#--     There are very few GenBank records that are translated with code 14,
#--     but a test translation shows that retranslating these records with code
#--     9 can cause premature terminations.  Therefore, GenBank will maintain
#--     code 14 until further information becomes available.
#--
#--  Version 3.8
#--     Added GTG start to Echinoderm mitochondrial code, code 9
#--
#--  Version 3.7
#--     Added code 23 Thraustochytrium mitochondrial code
#--        formerly OGMP code 93
#--        submitted by Gertraude Berger, Ph.D.
#--
#--  Version 3.6
#--     Added code 22 TAG-Leu, TCA-stop
#--        found in mitochondrial DNA of Scenedesmus obliquus
#--        submitted by Gertraude Berger, Ph.D.
#--        Organelle Genome Megasequencing Program, Univ Montreal
#--
#--  Version 3.5
#--     Added code 21, Trematode Mitochondrial
#--       (as deduced from: Garey & Wolstenholme,1989; Ohama et al, 1990)
#--     Added code 16, Chlorophycean Mitochondrial
#--       (TAG can translated to Leucine instaed to STOP in chlorophyceans
#--        and fungi)
#--
#--  Version 3.4
#--     Added CTG,TTG as allowed alternate start codons in Standard code.
#--        Prats et al. 1989, Hann et al. 1992
#--
#--  Version 3.3 - 10/13/95
#--     Added alternate intiation codon ATC to code 5
#--        based on complete mitochondrial genome of honeybee
#--        Crozier and Crozier (1993)
#--
#--  Version 3.2 - 6/24/95
#--  Code       Comments
#--   10        Alternative Ciliate Macronuclear renamed to Euplotid Macro...
#--   15        Bleharisma Macro.. code added
#--    5        Invertebrate Mito.. GTG allowed as alternate initiator
#--   11        Eubacterial renamed to Bacterial as most alternate starts
#--               have been found in Achea
#--
#--
#--  Version 3.1 - 1995
#--  Updated as per Andrzej Elzanowski at NCBI
#--     Complete documentation in NCBI toolkit documentation
#--  Note: 2 genetic codes have been deleted
#--
#--   Old id   Use id     - Notes
#--
#--   id 7      id 4      - Kinetoplast code now merged in code id 4
#--   id 8      id 1      - all plant chloroplast differences due to RNA edit
#--
#--*************************************************************************
#
#Genetic-code-table ::= {
# {
#  name "Standard" ,
#  name "SGC0" ,
#  id 1 ,
#  ncbieaa  "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
#  sncbieaa "---M---------------M---------------M----------------------------"
#  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
#  name "Vertebrate Mitochondrial" ,
#  name "SGC1" ,
#  id 2 ,
#  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG",
#  sncbieaa "--------------------------------MMMM---------------M------------"
#  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
#  name "Yeast Mitochondrial" ,
#  name "SGC2" ,
#  id 3 ,
#  ncbieaa  "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
#  sncbieaa "----------------------------------MM----------------------------"
#  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
#    name "Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate
# Mitochondrial; Mycoplasma; Spiroplasma" ,
#  name "SGC3" ,
#  id 4 ,
#  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
#  sncbieaa "--MM---------------M------------MMMM---------------M------------"
#  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
#  name "Invertebrate Mitochondrial" ,
#  name "SGC4" ,
#  id 5 ,
#  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG",
#  sncbieaa "---M----------------------------MMMM---------------M------------"
#  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
#  name "Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear" ,
#  name "SGC5" ,
#  id 6 ,
#  ncbieaa  "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
#  sncbieaa "-----------------------------------M----------------------------"
#  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
#  name "Echinoderm Mitochondrial; Flatworm Mitochondrial" ,
#  name "SGC8" ,
#  id 9 ,
#  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
#  sncbieaa "-----------------------------------M---------------M------------"
#  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
#  name "Euplotid Nuclear" ,
#  name "SGC9" ,
#  id 10 ,
#  ncbieaa  "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
#  sncbieaa "-----------------------------------M----------------------------"
#  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
#  name "Bacterial and Plant Plastid" ,
#  id 11 ,
#  ncbieaa  "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
#  sncbieaa "---M---------------M------------MMMM---------------M------------"
#  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
#  name "Alternative Yeast Nuclear" ,
#  id 12 ,
#  ncbieaa  "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
#  sncbieaa "-------------------M---------------M----------------------------"
#  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
#  name "Ascidian Mitochondrial" ,
#  id 13 ,
#  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG",
#  sncbieaa "---M------------------------------MM---------------M------------"
#  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# },
# {
#  name "Alternative Flatworm Mitochondrial" ,
#  id 14 ,
#  ncbieaa  "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
#  sncbieaa "-----------------------------------M----------------------------"
#  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# } ,
# {
#  name "Blepharisma Macronuclear" ,
#  id 15 ,
#  ncbieaa  "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
#  sncbieaa "-----------------------------------M----------------------------"
#  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# } ,
# {
#  name "Chlorophycean Mitochondrial" ,
#  id 16 ,
#  ncbieaa  "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
#  sncbieaa "-----------------------------------M----------------------------"
#  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# } ,
# {
#  name "Trematode Mitochondrial" ,
#  id 21 ,
#  ncbieaa  "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG",
#  sncbieaa "-----------------------------------M---------------M------------"
#  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# } ,
# {
#  name "Scenedesmus obliquus Mitochondrial" ,
#  id 22 ,
#  ncbieaa  "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
#  sncbieaa "-----------------------------------M----------------------------"
#  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# } ,
# {
#  name "Thraustochytrium Mitochondrial" ,
#  id 23 ,
#  ncbieaa  "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG",
#  sncbieaa "--------------------------------M--M---------------M------------"
#  -- Base1  TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG
#  -- Base2  TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG
#  -- Base3  TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG
# }
#}

##########################################################################
# Added Zebrafish microarray data (DONE, 2006-06-10, hartera)
# From Leonard Zon's group at the Children's Hospital, Boston
# Contact: Tony DiBiase, adibiase@enders.tch.harvard.edu
# Data is normalized and log2 transformed, then centered on mean of 0.
# Changed table names and reloaded MedianExps table so that the extras has
# the strain plus time point for the name otherwise the average is taken
# over all time points for a strain for the track display 
# when Tissue Averages is selected. (DONE, 2006-07-30, hartera) 
# Changed so that the extras column for the MedianExps table has the 
# developmental stage so that an average is taken across all strains for 
# each stage when Tissue Averages is selected.
# (Jim recommended displaying it this way and then it also fits in with the
# current framework for this type of track).
# Also added the strain name and stage to the extra column for the 
# Experiments tables (AllExps and MedianExps) so that when Chip ID is 
# selected then all of these are shown. (DONE, 2006-08-11, hartera)
# Added absolute data (before logs were taken). (DONE, 2006-09-19, hartera)
# The absolute value data was centered on a mean of 0. The log data was
# the log2 transformed normalized data, centered on a mean of 0.
# This section now OBSOLETE so removed. See section below on UPDATE of
# zebrafish microarray data. 

##########################################################################
# UPDATE the Zebrafish microarray data (DONE, 2006-06-16 - 2006-10-18, hartera)
# From Leonard Zon's group at the Children's Hospital, Boston
# Contact: Tony DiBiase, adibiase@enders.tch.harvard.edu
# Data is Loess normalized absolute values. Then use microarray processing
# programs to create ratio tables.
# The new data set was obtained so that the ratios could be calculated 
# directly from the normalized absolute data. The ratios are calculated as 
# the value for a probeset in one array to the median value across all arrays
# for that probeset and then a log2 is taken. This allows comparison 
# between arrays that may differ due to technical or biological differences.
# RE-CREATE tables. Data was log2 already so antilog the values to get 
# absolute values and then pass through the microarray processing programs.
# (DONE, 2007-01-05 - 2007-01-08, hartera)
   # work on hgwdev in a new directory for this data set
   ssh hgwdev
   mkdir /projects/compbio/data/microarray/zebrafishWT
   cd /projects/compbio/data/microarray/zebrafishWT
   # copy the data here received by e-mail and unzip
   unzip wt34.loessNorm.absval.2006-10-12.zip 
   # rename to the file name that format.pl reads below
   mv wt34.loessNorm.absval.2006-10-12.txt wtAffyNormLog2.txt
   # convert DOS line endings to Unix line endings
   dos2unix wtAffyNormLog2.txt 
cat << 'EOF' > format.pl
#!/usr/bin/perl -w
use strict;

# Reformat the expression file read from STDIN.
# Lines matching /at/ are printed unchanged (presumably the probe set rows).
# Any other line is treated as the column heading line: every tab-separated
# heading of the form strain.somites.hpf.N.N is replaced by a human readable
# name (strain-N-somites or strain-N-hpf, with Tu changed to TU); headings
# that do not have that form are dropped.
while (my $line = <STDIN>)
{
if ($line =~ /at/)
   {
   print $line;
   next;
   }
foreach my $heading (split(/\t/, $line))
   {
   # only experiment column headings are kept
   next unless ($heading =~ /^([A-Za-z]+)\.([0-9]+)\.([0-9]+)\.[0-9]+\.[0-9]+/);
   my ($strain, $somites, $hpf) = ($1, $2, $3);
   $strain =~ s/Tu/TU/;
   # the somite count is used when it is non-zero, otherwise the hpf value
   my $fullName = "";
   if ($somites > 0)
      {
      $fullName = $strain . "-" . $somites . "-somites";
      }
   elsif ($hpf > 0)
      {
      $fullName = $strain . "-" . $hpf . "-hpf";
      }
   print "\t$fullName";
   }
print "\n";
}
'EOF'
   # << emacs
chmod +x format.pl
perl format.pl < wtAffyNormLog2.txt > zebrafishWTNormLog2.txt
# antilog the values, log is base 2
cat << 'EOF' > cnvToAntilog
#!/usr/bin/awk -f
# Replace every log2 value (fields 2..NF of each tab-separated line) by
# 2^value; field 1 is copied through unchanged.
BEGIN {
    FS = "\t"
    RS = "\n"
    # ORS is empty so each print statement emits exactly what it is given
    ORS=""
}
{
    print $1 "\t"
    x=2
    while (x < NF) {
        print 2^$x "\t"
        x++
    }
    print 2^$NF "\n"
}
'EOF'
chmod +x cnvToAntilog
# run script and skip header line in file
# (the script is created above as cnvToAntilog, so it must be called with
# exactly that spelling; "tail -n +2" replaces the obsolete "tail +2" form)
tail -n +2 zebrafishWTNormLog2.txt | cnvToAntilog > tmp.txt
# add back header line:
head -n 1 zebrafishWTNormLog2.txt > header
cat header tmp.txt > zebrafishWTNormAbs.txt

# Then load the data into hgFixed using hgGnfMicroarray and use options
# to set the url, ref, and credit to "n/a" and chip to Zebrafish.
# Need to use this program to get 3 extras needed for hgMedianMicroarray
# No need to round the values this time as they are larger and have 
# a larger range.
# Create the main expRecord table and the expData table for the 
# absolute measurements
hgGnfMicroarray zebrafishZonWTAllExps zebrafishZonWTAll \
     zebrafishWTNormAbs.txt -chip=Zebrafish -url=n/a -ref=n/a -credit=n/a 

# Changed the Exps table so that the extras column for the MedianExps table 
# has the strain and developmental stage in the second field so that an 
# average is taken across all strains for each stage when Tissue Averages 
# is selected.
# (Jim recommended displaying it this way and then it also fits in with the
# current framework for this type of track).

# Dump the name and extras columns of the AllExps table to a file so that
# the extras values can be edited with cnvExtras.pl below.
hgsql -N -e 'select name, extras from zebrafishZonWTAllExps;' hgFixed \
      > zfishWTExps.extras

cat << 'EOF' > cnvExtras.pl
#!/usr/bin/perl -w
use strict;

# For every line on STDIN, replace the first "n/a" with the third
# comma-separated field of that same line and print the result.
while (<STDIN>) {
    my $third = (split(/,/))[2];
    s/n\/a/$third/;
    print;
}
'EOF'
chmod +x cnvExtras.pl
cnvExtras.pl < zfishWTExps.extras > zfishWTExps.extras.new 
# create set of mySQL statements from this to update the AllExps table
# to include the name in the second field of extras - same as in the third
# field. This is used for display when the "Arrays Grouped By Replicate
# Medians" (or Means) is selected from the track controls on the 
# description page.
# Each input line is <name><tab><new extras>; each output line is
#   update zebrafishZonWTAllExps set extras = "<new extras>" where name = "<name>";
# NOTE: keep the awk program on a single line. An awk string constant may
# not contain a line break, so wrapping it inside the quoted text is a
# syntax error.

awk 'BEGIN {FS = "\t"} {print "update zebrafishZonWTAllExps set extras = \"" $2 "\" where name = \"" $1 "\";";}' \
    zfishWTExps.extras.new > zfishWTExpsNewExtras.sql
hgsql hgFixed < zfishWTExpsNewExtras.sql

# Convert these to ratios using the median of the absolute values 
# across all experiments to be the denominator for each probeset.
# minAbsVal is 0 here as no value in this dataset is less than 1 and the
# default for this parameter is 20.
hgRatioMicroarray -minAbsVal=0 zebrafishZonWTAll zebrafishZonWTAllRatio 

# Create the .ra file for the Median tables
hgsql -N -e 'select extras, id from zebrafishZonWTAllExps;' hgFixed \
      > zfishWTExps
# remove extra information and leave experiment name
# (first strip the leading "Zebrafish,<name>," fields, then the first comma
# that remains on each line; note that the second perl -pi.bak run
# overwrites the zfishWTExps.bak backup written by the first run)
perl -pi.bak -e 's/Zebrafish,[A-Za-z]+\-[0-9]+\-[a-z]+,//' zfishWTExps
perl -pi.bak -e 's/,//' zfishWTExps

# alter script so that name for each experiment in column 2 is not just the 
# strain but the strain plus time point (same as first column). This goes into
# the extras column for zebrafishWTMedianExps and is used for Tissue Averages
# display for the array data track. Otherwise an average is taken for the 
# strain (hartera, 2006-07-30).
# change so that column 2 is the time point so that an average of time points
# is taken for the "Tissue Averages" Display (hartera, 2006-08-11)
cat << 'EOF' > cnvToMedian
#!/usr/bin/awk -f
# Input:  tab-separated lines of <experiment name> <id>.
# Output: one tab-separated line per distinct experiment name holding the
#         name, the 2nd and 3rd "-"-separated parts of the name joined
#         together, and a space-separated list of all ids seen for the name.

BEGIN { FS = OFS = "\t" }

{ idList[$1] = idList[$1] " " $2 }

END {
    for (expName in idList) {
        split(expName, part, "\\-")
        stage = part[2] part[3]
        # substr(..., 2) drops the leading space added by the first append
        ids = substr(idList[expName], 2)
        print expName, stage, ids
    }
}
'EOF'
   # << emacs
chmod +x cnvToMedian
cnvToMedian zfishWTExps > zfishZonWTMedian.ra
# re-order the *.ra file as this determines the order of display
# (14somites lines first, then 15somites, then hpf; each group is sorted.
# Lines matching none of these three patterns are dropped from the file.)
sort zfishZonWTMedian.ra | grep "14somites" > tmp.ra
sort zfishZonWTMedian.ra | grep "15somites" >> tmp.ra
sort zfishZonWTMedian.ra | grep "hpf" >> tmp.ra
mv tmp.ra zfishZonWTMedian.ra
# the hgMedianMicroarray commands below are run from this source directory
cp zfishZonWTMedian.ra ~/kent/src/hg/makeDb/hgMedianMicroarray
cd ~/kent/src/hg/makeDb/hgMedianMicroarray

# Take the median value over multiple replicants and put in this table:
hgMedianMicroarray hgFixed zebrafishZonWTAllRatio zebrafishZonWTAllExps \
  zfishZonWTMedian.ra zebrafishZonWTMedianRatio  \
  zebrafishZonWTMedianExps -minExps=1

# Make a median version of the absolute experiments:
hgMedianMicroarray hgFixed zebrafishZonWTAll zebrafishZonWTAllExps \
  zfishZonWTMedian.ra zebrafishZonWTMedian zebrafishZonWTMedianExps -minExps=1

# get distribution of MedianRatio scores:
hgsql -N -e 'select * from zebrafishZonWTMedianRatio;' hgFixed > medRatioData
# keep only the third column and split its comma-separated values so that
# there is one value per line for textHistogram
awk '{print $3}' medRatioData > medRatioData2
perl -pi.bak -e 's/,/\n/g' medRatioData2
textHistogram -real -binSize=0.2 -maxBinCount=1100 -minVal=-200 \
    medRatioData2 > histMedRatio.out

# from this histogram, see that most values fall between -2 to +2 so set the
# trackDb for the Affy Zon Wild Type Array track to have expScale of 2.0
# and expStep to 0.2 for the log scale to display the ratios in this track.
 
##########################################################################

#The Mouse GNF Expression Atlas 2 (2004)
##########################################################################
# Updated gv* tables for the Locus Variants tracks 
# (Belinda Giardine Sept 2006)
# This track is now available for hg17 and hg18, only the gvPos table needs to
# be redone for each build unless new mutations are added.  This load changes
# the schema (strand, label for gvPos) and adds a new LSDB (BTKbase) and more
# sanity checks on all the data causing some mismapped variants to be
# discarded.

##########################################################################
# mgcMBLabValid - Load of Genbank accessions that are in the Brent lab clone
# validation database.  This contains both human and mouse clones.  Since
# the Brent lab is no longer doing MGC validations, this set is fixed
# and shared by all mouse and human assemblies. (2006-10-26 markd)
    # create and move to the download directory for the accession list
    mkdir -p /cluster/data/genbank/data/download/mgcMBLab
    cd /cluster/data/genbank/data/download/mgcMBLab
    # save list of 41805 accessions received from brent lab as
    # mgcMBLabValid.2006-10-25.acc
    # load the list into hgFixed.mgcMBLabValid using the table definition
    # in mgcMBLabValid.sql
    hgLoadSqlTab hgFixed mgcMBLabValid ~/compbio/genbank/kent/src/hg/lib/mgcMBLabValid.sql mgcMBLabValid.2006-10-25.acc
    # compress the accession list once it has been loaded
    gzip mgcMBLabValid.2006-10-25.acc

##########################################################################
# ZEBRAFISH DEVELOPMENTAL ARRAYS FROM GENOME INSTITUTE OF SINGAPORE (GIS) 
# Data from Article:
# Transcriptome Analysis of Zebrafish Embryogenesis Using Microarrays Mathavan
# S, Lee SGP, Mak A, Miller LD, Murthy KRK, et al. PLoS Genetics Vol. 1, No. 2,
# e29, pages 260-276 doi:10.1371/journal.pgen.0010029
# Contact: Sinnakaruppan Mathavan <mathavans@gis.a-star.edu.sg>
# Downloaded expression data from
# http://giscompute.gis.a-star.edu.sg/~govind/zebrafish/data_download.html
# after clicking on link to download largest dataset (12.9 MB):
# Gene expression data showing the expression profile during different stages
# of zebrafish embryonic development for the genes selected from the array are
# presented (Compugen array). Each value represents an average performance of
# 2-4 replicates. GenBank id of the selected gene is given as the identifier.
# Total RNA from different stages of embryonic development, adult male and
# female were pooled in equal concentrations and used as reference RNA. The
# genes were annotated using Zebrafish Chip Annotation Database.

     ssh hgwdev
     mkdir -p /projects/compbio/data/microarray/zebrafishGISDev
     # Downloaded data and saved in Excel as a tab-separated text file:
     # PLOSGISData.txt
     # This file contains Genbank accessions and the expression values
     # which are log2 based. 

##########################################################################
# Belinda Giardine April 2007
# gv* tables: 
#	reload tables, additions and corrections, details in hg18 doc
