# Make directory
# :vim nowrap
# The build directory created here is the working directory for the
# steps that follow; most later paths are relative to it.
ssh hgwdev
mkdir /cluster/data/hg18/bed/jkg
cd /cluster/data/hg18/bed/jkg

# Get refseq and genbank .ra and fa files somehow - either
# from Mark, or by running something like:
# /cluster/data/genbank/bin/x86_64/gbGetSeqs -get=ra -gbRoot=/cluster/data/genbank \
#        -native -db=hg18 refseq mrna refSeq.ra
# /cluster/data/genbank/bin/x86_64/gbGetSeqs -get=ra -gbRoot=/cluster/data/genbank \
#        -native -db=hg18 genbank mrna mrna.ra
# Then convert these into some tab separated files, which takes 4 seconds:
# (The last argument is the output directory, here the current one.
# NOTE(review): later steps read cds.tab, mrnaSize.tab, refSeqStatus.tab,
# refPepStatus.tab, refToPep.tab and exceptions.tab without creating them;
# presumably they are written by this command - confirm.)
txReadRa mrna.ra refSeq.ra .

# Fetch the native hg18 sequence sets from the genbank tree: RefSeq
# peptides, RefSeq mRNAs and GenBank mRNAs.  About 70 seconds in total.
/cluster/data/genbank/bin/x86_64/gbGetSeqs -gbRoot=/cluster/data/genbank \
    -native -db=hg18 refseq pep refPep.fa
/cluster/data/genbank/bin/x86_64/gbGetSeqs -gbRoot=/cluster/data/genbank \
    -native -db=hg18 refseq mrna refSeq.fa
/cluster/data/genbank/bin/x86_64/gbGetSeqs -gbRoot=/cluster/data/genbank \
    -native -db=hg18 genbank mrna mrna.fa

# Snapshot MGC status and poly-A sizes from the database.  Do this right
# after the sequence fetch so both are in sync with each other.  About
# 4 seconds.
hgsql -N hg18 -e 'select acc,status from mgcFullStatus' > mgcStatus.tab
hgsql -N hg18 -e 'select distinct name,sizePolyA from mrnaOrientInfo' > sizePolyA.tab

# Collect per-chromosome psl files for refSeq, mrna and intronEst.
#
# The standard route (about 7 seconds) pulls all three from the database;
# the "stdin is empty" messages it prints are only from _random chroms
# with no annotations and can be ignored:
#mkdir refSeq mrna est
#foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`)
#    hgGetAnn -noMatchOk hg18 refSeqAli $c refSeq/$c.psl
#    hgGetAnn -noMatchOk hg18 mrna $c mrna/$c.psl
#    hgGetAnn -noMatchOk hg18 intronEst $c est/$c.psl
#end

# This time there are blat changes that are not yet in the main pipeline,
# so refSeq and mrna come from Mark's ncalign results instead and only the
# ESTs are taken from the database.  Takes 41 seconds.  Every chromosome
# ends up with a (possibly empty) psl file in each of the three dirs.
mkdir est
pslSplitOnTarget /cluster/store10/markd/genbank/ncalign/results/hg18/refseq.psl refSeq
pslSplitOnTarget /cluster/store10/markd/genbank/ncalign/results/hg18/genbank.psl mrna
set chroms = (`echo 'select chrom from chromInfo' | hgsql -N hg18`)
foreach c ($chroms)
    # Fill in chromosomes that pslSplitOnTarget had nothing to write for.
    foreach d (refSeq mrna)
        if (! -e $d/$c.psl) then
            echo creating empty $d/$c.psl
            echo -n "" >$d/$c.psl
        endif
    end
    hgGetAnn -noMatchOk hg18 intronEst $c est/$c.psl
    if (! -e est/$c.psl) then
        echo creating empty est/$c.psl
        echo -n "" >est/$c.psl
    endif
end

# Get list of accessions that are associated with antibodies from database.
# This will be a good list but not 100% complete.  Cluster these to get
# four or five antibody heavy regions.  Later we'll weed out input that
# falls primarily in these regions, and, include the regions themselves
# as special genes.  Takes 40 seconds
txAbFragFind hg18 antibodyAccs
# Select mRNA alignments by the antibodyAccs list.  (Presumably -invert
# keeps, rather than drops, the listed accessions - confirm against
# weedLines usage.)
pslCat mrna/*.psl -nohead | weedLines -invert antibodyAccs stdin antibody.psl
# Keep only clusters whose 10th column is above 20, and trim the output
# to its first 12 columns.
clusterPsl -prefix=antibody.abV antibody.psl stdout | awk '$10 > 20' | cut -f 1-12 > antibody.bed

# Convert psls to bed, saving mapping info and weeding antibodies.  Takes 2.5 min
# RefSeq is converted with -noFixStrand and is not antibody-filtered.
# mRNA and EST beds are passed through bedWeedOverlapping against
# antibody.bed, with maxOverlap 0.5 for mRNA and 0.3 for ESTs.  The EST
# conversion is given neither -cds nor -unusual.
foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`)
    txPslToBed refSeq/$c.psl -noFixStrand -cds=cds.tab /cluster/data/hg18/hg18.2bit refSeq/$c.bed -unusual=refSeq/$c.unusual
    txPslToBed mrna/$c.psl -cds=cds.tab /cluster/data/hg18/hg18.2bit stdout -unusual=mrna/$c.unusual \
        | bedWeedOverlapping antibody.bed maxOverlap=0.5 stdin mrna/$c.bed
    txPslToBed est/$c.psl /cluster/data/hg18/hg18.2bit stdout \
        | bedWeedOverlapping antibody.bed maxOverlap=0.3 stdin est/$c.bed
end

# Build per-chromosome splicing graphs.  The chromosome list is fetched
# once and reused for both passes.
mkdir bedToGraph
set chroms = (`echo 'select chrom from chromInfo' | hgsql -N hg18`)

# Pass 1: refSeq + mrna graphs, written to bedToGraph/.  About 10 seconds.
foreach c ($chroms)
    txBedToGraph -prefix=$c. refSeq/$c.bed refSeq mrna/$c.bed mrna bedToGraph/$c.txg
end

# Pass 2: EST-only graphs, written next to the EST beds with an "e"
# prefix on the graph names.  About 6 minutes.
foreach c ($chroms)
    txBedToGraph -prefix=e$c. est/$c.bed est est/$c.txg
end

# Create an evidence weight file
# One "source weight" pair per line.  The source names match the ones
# passed to txBedToGraph and txgAddEvidence elsewhere in this build.
cat > trim.weights <<end
refSeq  100
mrna    2
txOrtho 1
exoniphy 1
est 1
end

# Make evidence file for EST graph edges supported by at least 2 
# ests.  Takes about 30 seconds.
# The literal 2 on the command line is that support threshold.
foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`)
    txgGoodEdges est/$c.txg  trim.weights 2 est est/$c.edges
end

# Setup mouse dir
mkdir mm8
cd mm8

# Get mouse mrna including ESTs.  Takes about three minutes
# Alignments are streamed from the mm8 database straight into txPslToBed,
# so only the per-chromosome bed files are written to disk.
mkdir refSeq mrna est
foreach c (`echo 'select chrom from chromInfo' | hgsql -N mm8`)
    echo $c
    hgGetAnn -noMatchOk mm8 refSeqAli $c stdout | txPslToBed stdin /cluster/data/mm8/mm8.2bit refSeq/$c.bed 
    hgGetAnn -noMatchOk mm8 mrna $c stdout | txPslToBed stdin /cluster/data/mm8/mm8.2bit mrna/$c.bed
    hgGetAnn -noMatchOk mm8 intronEst $c stdout | txPslToBed stdin /cluster/data/mm8/mm8.2bit est/$c.bed
end
#ignore gripe about missing data in chrM

# Create mouse splicing graphs.  Takes a minute and a half.
# mouse.txg is built by appending one chromosome at a time, so any stale
# copy is removed first.
rm -f mouse.txg
foreach c (`echo 'select chrom from chromInfo' | hgsql -N mm8`)
    echo $c
    txBedToGraph refSeq/$c.bed refSeq mrna/$c.bed mrna est/$c.bed est stdout >> mouse.txg
end


# Clean up all but final mouse.txg
rm -r est mrna refSeq

# Unpack chains and nets, apply synteny filter and split by chromosome
# Takes 5 minutes.  Make up phony empty nets for ones that are empty after
# synteny filter.
# These commands run inside the mm8 directory, so the output lands in
# mm8/chains and mm8/nets.
zcat /cluster/data/hg18/bed/blastz.mm8/axtChain/hg18.mm8.all.chain.gz | chainSplit chains stdin
zcat /cluster/data/hg18/bed/blastz.mm8/axtChain/hg18.mm8.net.gz | netFilter -syn stdin | netSplit stdin nets
cd nets
# The loop is over hg18 (not mm8) chromosomes: the nets are keyed by the
# human chromosome, and every human chromosome needs a net file.
foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`)
    if (! -e $c.net) then
	echo making phony $c.net
        echo -n > $c.net
    endif
end
# Back up two levels (nets -> mm8 -> build directory).
cd ../..

# Make txOrtho directory and a para spec file
mkdir txOrtho
cd txOrtho
mkdir edges
cd ../bedToGraph
# The header is double-quoted so the leading # is not taken as a comment;
# \! keeps tcsh from treating ! as a history reference.
echo "#\!/bin/tcsh -ef" > ../txOrtho/spec
# One job per human graph file.  $f:r strips the .txg extension to get the
# chromosome name.  Paths in the job lines are relative to txOrtho, where
# the batch is run.
foreach f (*.txg)
    set c=$f:r
    echo txOrtho ../bedToGraph/$f ../mm8/chains/$c.chain ../mm8/nets/$c.net ../mm8/mouse.txg edges/$c.edges >> ../txOrtho/spec
end
cd ..

# Do txOrtho parasol run on iServer (high RAM) cluster
ssh kki "cd /cluster/data/hg18/bed/jkg/txOrtho; para make spec; para time"
#Completed: 49 of 49 jobs
#CPU time in finished jobs:       1916s      31.93m     0.53h    0.02d  0.000 y
#IO & Wait Time:                   540s       9.00m     0.15h    0.01d  0.000 y
#Average job time:                  50s       0.84m     0.01h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             235s       3.92m     0.07h    0.00d
#Submission to last job:           235s       3.92m     0.07h    0.00d

# Filter out some duplicate edges. These are legitimate from txOrtho's point
# of view, since they represent two different mouse edges both supporting
# a human edge. However, from the human point of view we want only one
# support from mouse orthology.  Just takes a second.
# Only the first nine columns take part in the comparison; anything past
# column 9 is dropped from the output.
cd txOrtho
mkdir uniqEdges
foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`)
    cut -f 1-9 edges/$c.edges | sort | uniq > uniqEdges/$c.edges
end
cd ..

# Clean up chains and nets since they are big
cd /cluster/data/hg18/bed/jkg
rm -r mm8/chains mm8/nets

# Get exoniphy. Takes about 4 seconds.
# The query emits six bed-style columns, ordered by chromosome and start.
echo "select chrom, txStart, txEnd, name, id, strand from exoniphy order by chrom, txStart;" \
	| hgsql -N hg18 > exoniphy.bed
bedToTxEdges exoniphy.bed exoniphy.edges

# Add evidence from ests, orthologous mouse transcripts, and exoniphy
# Takes 36 seconds.
# Three chained txgAddEvidence passes per chromosome.  exoniphy.edges is
# genome-wide, so only the first pass is given -chrom; the other two edge
# files are already per-chromosome.
mkdir graphWithEvidence
foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`)
    echo adding evidence for $c
    txgAddEvidence -chrom=$c bedToGraph/$c.txg exoniphy.edges exoniphy stdout \
       | txgAddEvidence stdin txOrtho/uniqEdges/$c.edges txOrtho stdout \
       | txgAddEvidence stdin est/$c.edges est graphWithEvidence/$c.txg
end

# Do  txWalk  - takes 32 seconds (mostly loading the mrnaSize.tab again and
# again...)
# Each chromosome produces a bed of transcripts plus a matching .ev
# evidence file.  trim.weights supplies the per-source weights.
# NOTE(review): the literal 3 is presumably the weight threshold - confirm
# against txWalk usage.
mkdir txWalk
foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`)
    txWalk graphWithEvidence/$c.txg trim.weights 3 txWalk/$c.bed -evidence=txWalk/$c.ev -sizes=mrnaSize.tab -defrag=0.25
end

# Make a file that lists the various categories of alt-splicing we see.
# Do this by making and analysing splicing graphs of just the transcripts
# that have passed our process so far.  The txgAnalyze program occasionally
# will make a duplicate, which is the reason for the sort/uniq run.
# Takes 7 seconds.
cat txWalk/*.bed | txBedToGraph stdin txWalk txWalk.txg
txgAnalyze txWalk.txg /cluster/data/hg18/hg18.2bit stdout | sort | uniq > altSplice.bed

# Get txWalk transcript sequences.  This'll take about 2 minutes
# txWalk.fa is built by appending, so remove any stale copy first.
rm -f txWalk.fa
foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`)
    sequenceForBed -db=hg18 -bedIn=txWalk/$c.bed -fastaOut=stdout -upCase -keepName >> txWalk.fa
end
# Split for the cluster runs.  This happens before the antibody sequences
# are appended below, so the split files hold txWalk transcripts only.
rm -rf txFaSplit
mkdir txFaSplit
faSplit sequence txWalk.fa 200 txFaSplit/

# Get parts of multiple alignments corresponding to transcripts.
# Takes 43 minutes.
# allOrgs.txt lists every species passed to mafFrags; ourOrgs.txt is the
# subset kept in the per-chromosome maf files.
echo hg18 panTro2 rheMac2 otoGar1 tupBel1 mm8 rn4 \
	cavPor2 oryCun1 canFam2 felCat1 equCab1 bosTau3 \
	dasNov1 loxAfr1 echTel1 monDom4 ornAna1 galGal3 \
	anoCar1 xenTro2 fr2 tetNig1 gasAcu1 oryLat1 danRer4 > allOrgs.txt
echo hg18 panTro2 rheMac2 mm8 canFam2 > ourOrgs.txt
foreach c (`echo 'select chrom from chromInfo' | hgsql -N hg18`)
    mafFrags hg18 multiz25way txWalk/$c.bed stdout -bed12 -orgs=allOrgs.txt \
       | mafSpeciesSubset stdin ourOrgs.txt txWalk/$c.maf -keepFirst
end

# Fold in antibody stuff at a stage where it *won't* be taken up by alignments
# After this step txWalk.fa no longer exists; the combined file is
# abWalk.fa.
sequenceForBed -db=hg18 -bedIn=antibody.bed -fastaOut=stdout -upCase -keepName >> txWalk.fa
mv txWalk.fa abWalk.fa

# Set up cluster run Victor Solovyev's Best Orf program.
# gensub2 expands the gsub template once per line of in.lst, giving one
# borfBig job per split fasta file.  borf is a small wrapper script that
# borfBig is told to run via -exe.
# NOTE(review): bestorf/fa is created but not referenced by any command
# in this section.
rm -rf bestorf
mkdir bestorf
mkdir bestorf/fa bestorf/tab
cd bestorf
ls -1 ../txFaSplit/*.fa > in.lst
cat << '_EOF_' > gsub
#LOOP
borfBig -exe=./borf $(path1) tab/$(root1).borf
#ENDLOOP
'_EOF_'
gensub2 in.lst single gsub spec
cat << '_EOF_' > borf
#!/bin/tcsh -ef
/cluster/bin/x86_64/bestorf /cluster/bin/x86_64/bestorf_hume.dat $1
'_EOF_'
chmod a+x borf
cd ..

# Do borf cluster run 
ssh pk "cd /cluster/data/hg18/bed/jkg/bestorf; para make spec; para time"
#Completed: 195 of 195 jobs
#CPU time in finished jobs:       1354s      22.56m     0.38h    0.02d  0.000 y
#IO & Wait Time:                  1129s      18.82m     0.31h    0.01d  0.000 y
#Average job time:                  13s       0.21m     0.00h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:              22s       0.37m     0.01h    0.00d
#Submission to last job:            22s       0.37m     0.01h    0.00d


# Fetch human protein set and table that describes if curated or not.
# Takes about a minute
# Both queries are restricted to taxon 9606.  The awk step turns each
# "acc sequence" row into a two-line fasta record.
hgsql -N sp070202 -e \
  'select p.acc, p.val from protein p, accToTaxon x where x.taxon=9606 and p.acc=x.acc'\
  |awk '{print ">" $1;print $2}' >uniProt.fa
hgsql -N sp070202 -e 'select i.acc,i.isCurated from info i,accToTaxon x where x.taxon=9606 and i.acc=x.acc' > uniCurated.tab

# Create blat dir - we'll run a couple of types of alignments here.
# Any previous blat directory, including old results, is discarded.
rm -rf blat
mkdir blat

# Set up blat jobs for proteins vs. translated txWalk transcripts.
# Each split transcript file gets two jobs: one against the UniProt
# proteins and one against the RefSeq peptides.  Paths in the job lines
# are relative to blat/protein, where the batch is run.
mkdir blat/protein
mkdir blat/protein/raw
cd txFaSplit
# The header has to be quoted.  Unquoted, the # starts a comment when this
# is run as a tcsh script, which throws away the text and the redirection
# that is supposed to start a fresh spec file.  The \! keeps tcsh from
# treating ! as a history reference.
echo "#\!/bin/tcsh -ef" > ../blat/protein/spec
foreach f (*.fa)
    # $f:r strips the .fa extension; it is used to name the output psl.
    set c=$f:r
    echo blat -t=dnax -q=prot -minIdentity=90 ../../txFaSplit/$f ../../uniProt.fa raw/uni_$c.psl >> ../blat/protein/spec
    echo blat -t=dnax -q=prot -minIdentity=90 ../../txFaSplit/$f ../../refPep.fa raw/ref_$c.psl >> ../blat/protein/spec
end
cd ..

# Run protein/transcript blat job on cluster
ssh pk "cd /cluster/data/hg18/bed/jkg/blat/protein; para make spec; para time"
#CPU time in finished jobs:      13571s     226.18m     3.77h    0.16d  0.000 y
#IO & Wait Time:                  5645s      94.08m     1.57h    0.07d  0.000 y
#Average job time:                  49s       0.82m     0.01h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             137s       2.28m     0.04h    0.00d
#Submission to last job:           137s       2.28m     0.04h    0.00d


# Set up blat jobs for mrna vs. txWalk transcripts.
# Each split transcript file gets two jobs: GenBank mRNAs at 95% minimum
# identity and RefSeq mRNAs at 97%.  Paths in the job lines are relative
# to blat/rna, where the batch is run.
mkdir blat/rna
mkdir blat/rna/raw
cd txFaSplit
# The header has to be quoted.  Unquoted, the # starts a comment when this
# is run as a tcsh script, which throws away the text and the redirection
# that is supposed to start a fresh spec file.  The \! keeps tcsh from
# treating ! as a history reference.
echo "#\!/bin/tcsh -ef" > ../blat/rna/spec
foreach f (*.fa)
    # $f:r strips the .fa extension; it is used to name the output psl.
    set c=$f:r
    echo blat -ooc=/cluster/data/hg18/11.ooc -minIdentity=95 ../../txFaSplit/$f ../../mrna.fa raw/mrna_$c.psl >> ../blat/rna/spec
    echo blat -ooc=/cluster/data/hg18/11.ooc -minIdentity=97 ../../txFaSplit/$f ../../refSeq.fa raw/ref_$c.psl >> ../blat/rna/spec
end
cd ..

# Run rna/transcript blat on cluster.  This is a little i/o heavy, so use
# maxNode=50, or optimize i/o somehow.
ssh pk "cd /cluster/data/hg18/bed/jkg/blat/rna; para make -maxNode=50 spec; para time"
#Completed: 390 of 390 jobs
#CPU time in finished jobs:      19127s     318.78m     5.31h    0.22d  0.001 y
#IO & Wait Time:                 14795s     246.58m     4.11h    0.17d  0.000 y
#Average job time:                  87s       1.45m     0.02h    0.00d
#Longest running job:                0s       0.00m     0.00h    0.00d
#Longest finished job:             581s       9.68m     0.16h    0.01d
#Submission to last job:           843s      14.05m     0.23h    0.01d


# Sort and select best alignments. Remove raw files for space. Takes 22
# seconds. Use pslReps not pslCdnaFilter because need -noIntrons flag,
# and also working on protein as well as rna alignments. The thresholds
# for the proteins in particular are quite loose, which is ok because
# they will be weighted against each other.  We lose some of the refSeq
# mappings at tighter thresholds.
# Input is sorted starting at column 10 before being handed to pslReps.
# The second pslReps output file is not wanted and goes to /dev/null.
# Only the RefSeq protein run is given -ignoreSize.
cd /cluster/data/hg18/bed/jkg/blat
pslCat -nohead rna/raw/ref*.psl | sort -k 10 | \
	pslReps -noIntrons -nohead -minAli=0.90 -nearTop=0.005 stdin rna/refSeq.psl /dev/null
pslCat -nohead rna/raw/mrna*.psl | sort -k 10 | \
	pslReps -noIntrons -nohead -minAli=0.90 -nearTop=0.005  stdin rna/mrna.psl /dev/null
pslCat -nohead protein/raw/ref*.psl | sort -k 10 | \
	pslReps -noIntrons -nohead -nearTop=0.02  -ignoreSize -minAli=0.85 stdin protein/refSeq.psl /dev/null
pslCat -nohead protein/raw/uni*.psl | sort -k 10 | \
	pslReps -noIntrons -nohead -nearTop=0.02  -minAli=0.85 stdin protein/uniProt.psl /dev/null
rm -r rna/raw protein/raw
cd ..

# Create evidence derived from mRNA CDS and mRNA/transcript alignments
# Takes 28 seconds.
# Five evidence sources, each written as a .tce file in cdsEvidence/:
# RefSeq mRNA, GenBank mRNA, RefSeq protein, UniProt protein and bestorf.
# All of them are evaluated against abWalk.fa (txWalk transcripts plus
# antibody regions).
rm -rf cdsEvidence
mkdir cdsEvidence
txCdsEvFromRna refSeq.fa cds.tab blat/rna/refSeq.psl abWalk.fa \
	cdsEvidence/refSeqTx.tce -refStatus=refSeqStatus.tab \
	-unmapped=cdsEvidence/refSeqTx.unmapped -exceptions=exceptions.tab
txCdsEvFromRna mrna.fa cds.tab blat/rna/mrna.psl abWalk.fa \
	cdsEvidence/mrnaTx.tce -mgcStatus=mgcStatus.tab \
	-unmapped=cdsEvidence/mrna.unmapped
txCdsEvFromProtein refPep.fa blat/protein/refSeq.psl abWalk.fa \
	cdsEvidence/refSeqProt.tce -refStatus=refPepStatus.tab \
	-unmapped=cdsEvidence/refSeqProt.unmapped \
	-exceptions=exceptions.tab -refToPep=refToPep.tab \
	-dodgeStop=3 -minCoverage=0.3
txCdsEvFromProtein uniProt.fa blat/protein/uniProt.psl abWalk.fa \
	cdsEvidence/uniProt.tce -uniStatus=uniCurated.tab \
	-unmapped=cdsEvidence/uniProt.unmapped -source=tremble
cat bestorf/tab/*.borf > all.borf
txCdsEvFromBorf all.borf abWalk.fa cdsEvidence/bestorf.tce

# Consolidate CDS and look for longest ORFs in other species.  Takes 40 seconds
# unweighted.tce is sorted, so a plain uniq on its first three columns is
# enough to give the distinct intervals.
cat cdsEvidence/*.tce | sort  > unweighted.tce
cut -f 1-3 unweighted.tce | uniq > cdsIntervals.tab
cat txWalk/*.maf | txCdsOrtho cdsIntervals.tab stdin cdsOrtho.tab

# Pick best CDS. Takes 4 seconds
# cds.weights is copied from the source tree so that the weights used
# for this build are kept alongside the results.
cp ~/kent/src/hg/txCds/txCdsPick/cds.weights .
cat txWalk/*.bed antibody.bed | \
	txCdsPick stdin unweighted.tce cds.weights pick.tce pick.picks \
		-weightedTce=weighted.tce -refToPep=refToPep.tab \
		-exceptionsIn=exceptions.tab \
		-exceptionsOut=txWalk.exceptions

# Create gene prediction (GTF) and peptide fasta file.
# Takes 7 seconds.
cat txWalk/*.bed antibody.bed | txCdsToGene stdin abWalk.fa pick.tce pick.gtf pick.fa \
	-bedOut=pick.bed -exceptions=txWalk.exceptions

# Create gene info table. Takes 8 seconds
# all.flip holds column 6 of every .unusual line whose column 5 is "flip".
cat mrna/*.unusual refSeq/*.unusual | awk '$5=="flip" {print $6;}' > all.flip
cat mrna/*.psl refSeq/*.psl | txInfoAssemble pick.bed pick.tce all.borf \
	altSplice.bed txWalk.exceptions sizePolyA.tab stdin all.flip prelim.info

# Cluster purely based on CDS (in same frame). Takes 1 second
txCdsCluster pick.bed pick.cluster

# Flag suspicious CDS regions, and add this to info file. Weed out bad CDS.
# Map CDS to gene set.  Takes 10 seconds
# The second txCdsToGene run below repeats the gene-building step with the
# weeded CDS set, giving the weededCds.{gtf,faa,bed} files.
txCdsSuspect pick.bed txWalk.txg pick.cluster prelim.info pick.suspect pick.info -niceProt=pick.nice
txCdsWeed pick.tce pick.info cdsOrtho.tab weededCds.tce weededCds.info
cat txWalk/*.bed antibody.bed | txCdsToGene stdin abWalk.fa weededCds.tce weededCds.gtf weededCds.faa \
	-bedOut=weededCds.bed -exceptions=txWalk.exceptions

# Separate out transcripts into coding and 4 noncoding categories.
# Generate new gene set that weeds out the junkiest. Takes 9 seconds.
# weeded.info keeps every row whose second column is not "nearCodingJunk";
# weeds.lst is the list of the ids that were dropped.  weeded.bed is the
# concatenation of the four kept categories.
txGeneSeparateNoncoding weededCds.bed weededCds.info \
	coding.bed nearCoding.bed nearCodingJunk.bed antisense.bed noncoding.bed separated.info
awk '$2 != "nearCodingJunk"' separated.info > weeded.info
awk '$2 == "nearCodingJunk" {print $1}' separated.info > weeds.lst
cat coding.bed nearCoding.bed antisense.bed noncoding.bed > weeded.bed
txGeneFromBed weeded.bed pick.picks weeded.gp

# Generate data for SVM.
cat txWalk/*.bed | txCdsSvmInput stdin unweighted.tce cdsOrtho.tab separated.info svm.list svm.vector

# Assign permanent accessions to each transcript, and make up a number
# of our files with this accession in place of the temporary IDs we've been
# using.  Takes 4 seconds
# The previous build's ucscGenes.bed (../jkg.7) is the first argument.
# NOTE(review): the txLastId path uses ~kent (user kent's home) where the
# rest of this file uses ~/kent - confirm this is intentional.
txGeneAccession ../jkg.7/ucscGenes.bed ~kent/src/hg/txGene/txGeneAccession/txLastId \
	weeded.bed txToAcc.tab oldToNew.tab
# subColumn's first argument is the column number holding the temporary
# id: 4 for bed files, 1 for info/picks/ev, 2 for the nice file.  Files
# that were not already weeded are filtered through weeds.lst first.
subColumn 4 weeded.bed txToAcc.tab ucscGenes.bed
subColumn 1 weeded.info txToAcc.tab ucscGenes.info
weedLines weeds.lst pick.picks stdout | subColumn 1 stdin txToAcc.tab ucscGenes.picks
weedLines weeds.lst pick.nice stdout | subColumn 2 stdin txToAcc.tab ucscGenes.nice
subColumn 4 coding.bed txToAcc.tab ucscCoding.bed
subColumn 4 nearCoding.bed txToAcc.tab ucscNearCoding.bed
subColumn 4 antisense.bed txToAcc.tab ucscAntisense.bed
subColumn 4 noncoding.bed txToAcc.tab ucscNoncoding.bed
cat txWalk/*.ev | weedLines weeds.lst stdin stdout | subColumn 1 stdin txToAcc.tab ucscGenes.ev

# Cluster the coding and the noncoding sets, and make up canonical and
# isoforms tables. Takes 3 seconds.
# Noncoding and antisense graphs are built separately, with different
# name prefixes, and then concatenated into senseAnti.txg.
txCdsCluster ucscCoding.bed coding.cluster
txBedToGraph ucscNoncoding.bed noncoding noncoding.txg -prefix=non
txBedToGraph ucscAntisense.bed antisense antisense.txg -prefix=anti
cat noncoding.txg antisense.txg > senseAnti.txg
txGeneCanonical coding.cluster ucscGenes.info senseAnti.txg ucscGenes.bed ucscNearCoding.bed \
	canonical.tab isoforms.tab txCluster.tab

#####################################################################################
# Start loading up the database!  Here we load into hg18a, which is a slimmed
# down copy of hg18.  Takes 2 seconds
# Table definitions come from the .sql files in the kent source tree.
hgLoadSqlTab hg18a knownCanonical ~/kent/src/hg/lib/knownCanonical.sql canonical.tab
hgLoadSqlTab hg18a knownIsoforms ~/kent/src/hg/lib/knownIsoforms.sql isoforms.tab

# Make files with protein and mrna accessions.  These will be taken from
# RefSeq for the RefSeq ones, and derived from our transcripts for the rest.
# Load these sequences into database. Takes 17 seconds.
# The transcript peptide input is weededCds.faa, written by the second
# txCdsToGene run.  No step in this build produces a file named
# weeded.faa, which is what this command used to reference.
txGeneProtAndRna weeded.bed separated.info abWalk.fa weededCds.faa refSeq.fa \
    refToPep.tab refPep.fa txToAcc.tab ucscGenes.fa ucscGenes.faa
hgPepPred hg18a generic knownGenePep ucscGenes.faa
hgPepPred hg18a generic knownGeneMrna ucscGenes.fa

# Make up knownGenes table, adding uniProt ID. Load into database. Takes 3
# seconds.
txGeneFromBed ucscGenes.bed ucscGenes.picks ucscGenes.gp
hgLoadSqlTab hg18a knownGene ~/kent/src/hg/lib/knownGene.sql ucscGenes.gp

# Make up kgXref table.  Takes about 3 minutes.
# Note that txGeneXref is pointed at hg18, while the result is loaded
# into hg18a.
txGeneXref hg18 sp070202 ucscGenes.info ucscGenes.picks ucscGenes.ev ucscGenes.xref
hgLoadSqlTab hg18a kgXref ~/kent/src/hg/lib/kgXref.sql ucscGenes.xref

# Make up and load kgColor table. Takes about a minute.
txGeneColor sp070202 ucscGenes.info ucscGenes.picks ucscGenes.color
hgLoadSqlTab hg18a kgColor ~/kent/src/hg/lib/kgColor.sql ucscGenes.color

# Load up kgTxInfo table. Takes 0.3 second
hgLoadSqlTab hg18a kgTxInfo ~/kent/src/hg/lib/txInfo.sql ucscGenes.info

# Make up alias tables and load them. Takes a minute or so.
# txGeneAlias writes to scratch files (foo.*), which are de-duplicated
# into the final ucscGenes.* files and then removed.
txGeneAlias hg18a sp070202 ucscGenes.xref ucscGenes.ev foo.alias foo.protAlias
sort foo.alias | uniq > ucscGenes.alias
sort foo.protAlias | uniq > ucscGenes.protAlias
rm foo.alias foo.protAlias
hgLoadSqlTab hg18a kgAlias ~/kent/src/hg/lib/kgAlias.sql ucscGenes.alias
hgLoadSqlTab hg18a kgProtAlias ~/kent/src/hg/lib/kgProtAlias.sql ucscGenes.protAlias

# Make full text index.  Takes a minute or so.  After this the genome browser
# tracks display will work including the position search.  The genes details
# page, gene sorter, and proteome browser still need more tables.
mkdir index
cd index
hgKgGetText hg18a knownGene.text
ixIxx knownGene.text knownGene.ix knownGene.ixx
# /gbdb gets symlinks to the index files rather than copies.
ln -s /cluster/data/hg18/bed/jkg/index/knownGene.ix  /gbdb/hg18a/knownGene.ix
ln -s /cluster/data/hg18/bed/jkg/index/knownGene.ixx /gbdb/hg18a/knownGene.ixx
# Go back to the build directory.  The steps that follow use relative
# paths (refToLl.txt, affyUcla.weight, mkdir hgNearBlastp4), and the
# blastp config.ra declares its buildDir as
# /cluster/data/hg18/bed/jkg/hgNearBlastp4, not a directory under index/.
cd ..
     
# Create a bunch of knownToXxx tables.  Takes about 3 minutes:
# refToLl.txt (below) and affyUcla.weight (further down) are written to
# the current working directory.
hgMapToGene hg18a allenBrainAli -type=psl knownGene knownToAllenBrain
hgMapToGene hg18a ensGene knownGene knownToEnsembl
hgMapToGene hg18a gnfAtlas2 knownGene knownToGnfAtlas2 '-type=bed 12'
hgMapToGene hg18a affyGnf1h knownGene knownToGnf1h
hgMapToGene hg18a HInvGeneMrna knownGene knownToHInv
# knownToLocusLink maps through refGene, using the mrnaAcc -> locusLinkId
# lookup dumped from refLink.
hgsql --skip-column-names -e "select mrnaAcc,locusLinkId from refLink" hg18a > refToLl.txt
hgMapToGene hg18a refGene knownGene knownToLocusLink -lookup=refToLl.txt
hgMapViaSwissProt hg18a knownGene name proteinID Pfam knownToPfam
hgMapToGene hg18a refGene knownGene knownToRefSeq
hgMapToGene "-type=bed 12" hg18a affyUclaNorm knownGene knownToU133
hgMapToGene hg18a affyU133Plus2 knownGene knownToU133Plus2
hgMapToGene hg18a affyU95 knownGene knownToU95
knownToVisiGene hg18a -fromProbePsl=vgAllProbes

# Create expression distance table - takes about an hour
hgExpDistance hg18a hgFixed.gnfHumanAtlas2MedianRatio \
    hgFixed.gnfHumanAtlas2MedianExps gnfAtlas2Distance \
    -lookup=knownToGnfAtlas2

# Create another expression distance table, this one is quicker - 6 min..
hgExpDistance hg18a hgFixed.gnfHumanU95MedianRatio \
	hgFixed.gnfHumanU95Exps gnfU95Distance  -lookup=knownToU95

# This one takes a while too, 1 hour 22 minutes.
cp -p ~/kent/src/hg/near/hgExpDistance/affyUcla.weight .
time hgExpDistance hg18a affyUclaNorm affyUclaExp knownExpDistance \
	-weights=affyUcla.weight -lookup=knownToU133

# Run nice Perl script to make all protein blast runs for
# Gene Sorter and Known Genes details page.  Takes about
# 45 minutes to run.
# config.ra gives the target and query databases, one peptide fasta per
# database, and the build and scratch directories.
# NOTE(review): the config names the target database jk18, while every
# other step in this file loads hg18a - confirm which one is intended.
mkdir hgNearBlastp4
cd hgNearBlastp4
cat << _EOF_ > config.ra
# Latest human vs. other Gene Sorter orgs:
# mouse, rat, zebrafish, worm, yeast, fly

targetGenesetPrefix known
targetDb jk18
queryDbs mm8 rn4 danRer4 ce3 sacCer1 dm2

mm8Fa /cluster/data/mm8/bed/geneSorter/blastp/known.faa
jk18Fa /cluster/data/hg18/bed/jkg/ucscGenes.faa
rn4Fa /cluster/data/rn4/bed/blastp/known.faa
danRer4Fa /cluster/data/danRer4/bed/blastp/ensembl.faa
ce3Fa /cluster/data/ce3/bed/blastp/wormPep140.faa
sacCer1Fa /cluster/data/sacCer1/bed/blastp/sgdPep.faa
dm2Fa /cluster/data/dm2/bed/flybase4.1/flybasePep.fa

buildDir /cluster/data/hg18/bed/jkg/hgNearBlastp4
scratchDir /san/sanvol1/scratch/jkgHgNearBlastp4
_EOF_
# |& sends both stdout and stderr to tee, so do.log holds the full log.
doHgNearBlastp.pl config.ra |& tee do.log 

# Remove non-syntenic hits for mouse and rat
# Takes a few minutes
# hg18a has no liftOver chains of its own, so the hg18 chains are linked
# in under hg18a names.
mkdir /gbdb/hg18a/liftOver
ln -s /cluster/data/hg18/bed/liftOver/hg18ToRn4.over.chain.gz \
    /gbdb/hg18a/liftOver/hg18aToRn4.over.chain.gz
ln -s /cluster/data/hg18/bed/liftOver/hg18ToMm8.over.chain.gz \
    /gbdb/hg18a/liftOver/hg18aToMm8.over.chain.gz
synBlastp.csh hg18a rn4
synBlastp.csh hg18a mm8

# MAKE FOLDUTR TABLES 
# First set up directory structure and extract UTR sequence on hgwdev
    ssh hgwdev
    cd /cluster/data/hg18/bed/jkg
    mkdir -p rnaStruct
    cd rnaStruct
    mkdir -p utr3/split utr5/split utr3/fold utr5/fold
    utrFa hg18a knownGene utr3 utr3/utr.fa
    utrFa hg18a knownGene utr5 utr5/utr.fa

# Split up files and make files that define job.
# The same gsub template is used for both UTR types; it is written in utr3
# and copied to utr5.  \$ keeps tcsh from expanding $(path1) while the
# here-document is written.
    faSplit sequence utr3/utr.fa 10000 utr3/split/s
    faSplit sequence utr5/utr.fa 10000 utr5/split/s
    ls -1 utr3/split > utr3/in.lst
    ls -1 utr5/split > utr5/in.lst
    cd utr3
    cat > gsub <<end
#LOOP
rnaFoldBig split/\$(path1) fold
#ENDLOOP
end
    gensub2 in.lst single gsub spec
    cp gsub ../utr5
    cd ../utr5
    gensub2 in.lst single gsub spec

# Do cluster run for 3' UTRs
# NOTE(review): the gensub2 below is still in utr5 and repeats the command
# just above; the 3' run itself starts after the cd to utr3.
    gensub2 in.lst single gsub spec

    ssh pk
    cd /cluster/data/hg18/bed/jkg/rnaStruct/utr3
    para create spec
    para try
    para push
# Completed: 36097 of 36097 jobs
# CPU time in finished jobs:     335580s    5593.00m    93.22h    3.88d  0.011 y
# IO & Wait Time:                653230s   10887.16m   181.45h    7.56d  0.021 y
# Average job time:                  27s       0.46m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            1730s      28.83m     0.48h    0.02d
# Submission to last job:          6007s     100.12m     1.67h    0.07d

# Do cluster run for 5' UTRs 
# Run from the utr5 batch directory.  This used to cd to utr3, which would
# have re-created and re-pushed the 3' batch; the job count recorded
# below (34011, against 36097 for the 3' run) shows the real run was on
# the 5' set.
    cd /cluster/data/hg18/bed/jkg/rnaStruct/utr5
    para create spec
    para try
    para push
# Completed: 34011 of 34011 jobs
# CPU time in finished jobs:      78543s    1309.05m    21.82h    0.91d  0.002 y
# IO & Wait Time:                938250s   15637.50m   260.62h   10.86d  0.030 y
# Average job time:                  30s       0.50m     0.01h    0.00d
# Longest running job:                0s       0.00m     0.00h    0.00d
# Longest finished job:            5873s      97.88m     1.63h    0.07d
# Submission to last job:          6139s     102.32m     1.71h    0.07d

# Load database
# Only the 3' load is given -warnEmpty; see the note on empty files below.
    ssh hgwdev
    cd /cluster/data/hg18/bed/jkg/rnaStruct/utr5
    hgLoadRnaFold hg18a foldUtr5 fold
    cd ../utr3
    hgLoadRnaFold -warnEmpty hg18a foldUtr3 fold
# There are three warnings on empty files.  Seems to be a problem in
# RNAfold, so not easy for us to fix. Consequence is not too bad, just a
# few 3' UTRs will be missing annotation.

# Clean up
# Runs in utr3 first (the current directory after the load), then utr5.
    rm -r split fold err batch.bak
    cd ../utr5
    rm -r split fold err batch.bak

# Create Vidal and Wanker protein/protein interaction networks and 
# load them.  Takes about 8 minutes.
# Both runs use the same -sqlRemap query, which joins refLink and kgXref
# on the mRNA accession to pair locusLinkID with kgID.
cd /cluster/data/hg18/p2p
hgNetDist vidal/humanVidal.p2p hg18a humanVidalP2P -threshold=2 \
	-sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA"
hgNetDist wanker/humanWanker.p2p hg18a humanWankerP2P -threshold=2 -sqlRemap="select distinct locusLinkID, kgID from refLink, kgXref where refLink.mrnaAcc = kgXref.mRNA"

###########################################################################
## Load take 9 into the database. Takes 2 minutes.
# Loads the intermediate results of the build into hg18 as jkg* tables.
# The awk filters select edge rows by the type in column 8 ("exon" or
# "intron") and pass their first six columns on to hgLoadBed.
ssh hgwdev
cd /cluster/data/hg18/bed/jkg
cat refSeq/*.psl | hgLoadPsl hg18 stdin -table=jkgInputRefSeq
cat mrna/*.psl | hgLoadPsl hg18 stdin -table=jkgInputMrna
hgLoadBed hg18 jkgRefSeqBed refSeq/*.bed
hgLoadBed hg18 jkgMrnaBed mrna/*.bed
hgLoadBed hg18 jkgAntibodyBed antibody.bed
cat refSeq/*.unusual | cut -f 1-4 |  hgLoadBed hg18 jkgRefSeqUnusual  stdin
cat bedToGraph/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb -sqlTable=jkgNativeGraph.sql hg18 jkgNativeGraph stdin
awk '$8=="exon" {print $1,$2,$3,$4,$5,$6}' txOrtho/edges/*.edges | hgLoadBed hg18 jkgOrthoExons stdin
awk '$8=="intron" {print $1,$2,$3,$4,$5,$6}' txOrtho/edges/*.edges | hgLoadBed hg18 jkgOrthoIntrons stdin
awk '$8=="exon" {print $1,$2,$3,$4,$5,$6}' est/*.edges | hgLoadBed hg18 jkgEstExons stdin
awk '$8=="intron" {print $1,$2,$3,$4,$5,$6}' est/*.edges | hgLoadBed hg18 jkgEstIntrons stdin
hgLoadBed -strict hg18 jkgTxWalk txWalk/*.bed
hgLoadBed hg18 jkgAltSplice altSplice.bed
ldHgGene hg18 jkgTxCdsPick pick.gtf -gtf -genePredExt
hgLoadCdsEvidence hg18 jkgTxCdsEvidence weighted.tce
hgLoadSqlTab hg18 jkgTxInfo ~/kent/src/hg/lib/txInfo.sql separated.info
ldHgGene hg18 jkgTxCdsRepick weededCds.gtf -gtf -genePredExt
hgLoadSqlTab hg18 jkgUcscGenes ~/kent/src/hg/lib/knownGene.sql weeded.gp

###########################################################################
## Load take 10 into the database. Takes 30 seconds.
# Same set of loads as take 9, but into tables with a "2" suffix, so both
# takes can sit side by side in hg18.
# NOTE(review): this take loads unweighted.tce where take 9 loads
# weighted.tce, and reads txWalk.bed, a file no visible step creates (the
# build writes txWalk/*.bed) - confirm both are intended.
ssh hgwdev
cd /cluster/data/hg18/bed/jkg
cat refSeq/*.psl | hgLoadPsl hg18 stdin -table=jkgInputRefSeq2
cat mrna/*.psl | hgLoadPsl hg18 stdin -table=jkgInputMrna2
hgLoadBed hg18 jkgRefSeqBed2 refSeq/*.bed
hgLoadBed hg18 jkgMrnaBed2 mrna/*.bed
hgLoadBed hg18 jkgAntibodyBed2 antibody.bed
cat refSeq/*.unusual | cut -f 1-4 |  hgLoadBed -noStrict hg18 jkgRefSeqUnusual2  stdin
cat bedToGraph/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb -sqlTable=jkgNativeGraph2.sql hg18 jkgNativeGraph2 stdin
awk '$8=="exon" {print $1,$2,$3,$4,$5,$6}' txOrtho/edges/*.edges | hgLoadBed hg18 jkgOrthoExons2 stdin
awk '$8=="intron" {print $1,$2,$3,$4,$5,$6}' txOrtho/edges/*.edges | hgLoadBed hg18 jkgOrthoIntrons2 stdin
awk '$8=="exon" {print $1,$2,$3,$4,$5,$6}' est/*.edges | hgLoadBed hg18 jkgEstExons2 stdin
awk '$8=="intron" {print $1,$2,$3,$4,$5,$6}' est/*.edges | hgLoadBed hg18 jkgEstIntrons2 stdin
hgLoadBed hg18 jkgTxWalk2 txWalk.bed antibody.bed
hgLoadBed hg18 jkgAltSplice2 altSplice.bed
ldHgGene hg18 jkgTxCdsPick2 pick.gtf -gtf -genePredExt
hgLoadCdsEvidence hg18 jkgTxCdsEvidence2 unweighted.tce
hgLoadSqlTab hg18 jkgTxInfo2 ~/kent/src/hg/lib/txInfo.sql separated.info
ldHgGene hg18 jkgTxCdsRepick2 weededCds.gtf -gtf -genePredExt
hgLoadSqlTab hg18 jkgUcscGenes2 ~/kent/src/hg/lib/knownGene.sql weeded.gp


###########################################################################
## Load take 6 into the database. Takes 30 seconds.
# NOTE(review): this take writes to the same unsuffixed jkg* table names
# as the take 9 load, so running the sections in file order would replace
# the take 9 tables.  It also loads weeded.gtf, which no visible step
# creates (the current build writes weededCds.gtf).  This looks like a
# record of an older run - confirm before re-running.
ssh hgwdev
cd /cluster/data/hg18/bed/jkg
cat refSeq/*.psl | hgLoadPsl hg18 stdin -table=jkgInputRefSeq
cat mrna/*.psl | hgLoadPsl hg18 stdin -table=jkgInputMrna
hgLoadBed hg18 jkgRefSeqBed refSeq/*.bed
hgLoadBed hg18 jkgMrnaBed mrna/*.bed
hgLoadBed hg18 jkgAntibodyBed antibody.bed
cat refSeq/*.unusual | cut -f 1-4 |  hgLoadBed hg18 jkgRefSeqUnusual  stdin
cat bedToGraph/*.txg | txgToAgx stdin stdout | hgLoadBed -notItemRgb -sqlTable=jkgNativeGraph.sql hg18 jkgNativeGraph stdin
awk '$8=="exon" {print $1,$2,$3,$4,$5,$6}' txOrtho/edges/*.edges | hgLoadBed hg18 jkgOrthoExons stdin
awk '$8=="intron" {print $1,$2,$3,$4,$5,$6}' txOrtho/edges/*.edges | hgLoadBed hg18 jkgOrthoIntrons stdin
awk '$8=="exon" {print $1,$2,$3,$4,$5,$6}' est/*.edges | hgLoadBed hg18 jkgEstExons stdin
awk '$8=="intron" {print $1,$2,$3,$4,$5,$6}' est/*.edges | hgLoadBed hg18 jkgEstIntrons stdin
hgLoadBed -strict hg18 jkgTxWalk txWalk/*.bed
hgLoadBed hg18 jkgAltSplice altSplice.bed
ldHgGene hg18 jkgTxCdsPick pick.gtf -gtf -genePredExt
hgLoadCdsEvidence hg18 jkgTxCdsEvidence weighted.tce
hgLoadSqlTab hg18 jkgTxInfo ~/kent/src/hg/lib/txInfo.sql separated.info
ldHgGene hg18 jkgTxCdsRepick weeded.gtf -gtf -genePredExt


###########################################################################
## Some analysis of things in uniProt.fa that didn't map to any sequence.
## Examining random sample of 25 of these.  (Only 81% did map, over 10,000
## didn't).

Summary:
  7 - SE - Single exon gene dropped by orthoSplice, usually wrongly.
       fix orthoSplice a little, rework single exon handling.
  4 - TS - too small (< 18 AA)
       ignore
  4 - DI - divergent (%ID < 90)
       ignore - it's ok, all examples in HLA, best not to map
          wrong haplotypes to genome.
  3 - RI - retained intron nicely filtered out by orthoSplice
       Yay!
  2 - NM - no mRNAs mapped in region. SwissProt based on DNA clone annotation 
      ignore - want stronger evidence than this.
  1 - WS - weird splicing, strange intron boundaries, decent evidence for them...
       Hmm, send this example to Al to look at?
  1 - DE - exon dropped, only one mRNA supports it in alt situation.
       Hmm. Probably just live with this.
  1 - TC - Part of variable region of T-cell receptor. Splicing weird.
       Hmm.  Ignore/make separate process for Ab fragments and T-Cell receptor
       fragments?
  1 - NE - noisy ends or retained intron. Probably as well as not to filter out.
       Ignore.
  1 - SC - Single exon transcript dropped by altSplice.  Overlaps spliced transcripts.
      Consider making altSplice better on single exons in midst of other stuff
      ... or not.


DI A0AQY6 - only 89.9% ID on best match to genome (matches in chr6_xxx_hap)
DI A1E127 - only 88.0% ID on best match to genome (matches in chr6_xxx_hap)
TS P30095 - 10 letters long - couldn't web blat to genome
RI Q16193 - Looks to be derived from transcript with retained intron. Dropped
         by orthoSplice.
TS Q16217 - 11 letters long - couldn't web blat to genome
DI Q571Q4 - only 89.9% ID on best match to genome (matches in chr6_xxx_hap)
RI Q59GH2 - Looks to be derived from transcript with retained intron. Dropped by
	 orthoSplice
NM Q5NV77 - We map no RNAs there.  Ensembl has a prediction. SwissProt entry
	    based on DNA clone, not RNA
DI Q5QR74 - no blat alignment to genome. 24 AA. NCBI - hits HLA region of alt assembly
NM Q5TEV5 - We map no RNAs there. Ensembl has a prediction. SwissProt entry
	    based on DNA clone, not RNA
TC Q5ZGK9 - no blat alignment to genome. 75 AA. NCBI - 57 bases hit at 100% ID
	    (76% cov) on chr7. In raw.psl. Not clear why online blat fails,
	    command line finds it.  Command line does have higher sensitivity.
	    No repeat involvement.  Looks like RNA gets dropped because of 
	    noncanonical ends to large intron.  Huge pile up of mRNAs.
	    Is variable region of a T-cell receptor
SE Q6FHK0 - Single exon gene dropped by orthoSplice in spite of RefSeq, mRNA, and
	 exoniphy.
NE Q6PIG1 - Only 73/111 bases covered.  Filtered out by pslReps.  Noisy
	 ends/retained intron perhaps?  Is on chr9.
SE Q6ZP23 - Single exon in midst of intron on same strand.  No ensembl or
	 exoniphy. Does overlap another single exon transcript, but in a very
	 loose way.
NM Q6ZRI4 - no blat alignment to genome. 123 AA. NCBI - has gap of 14 against genome.
         No RNAs in browser map to this region, though there is an Ensembl
	 gene.... Curious that protein blat misses via web.... Nucleotide blat finds
	 it.
SE Q71MG6 - no blat alignment to genome. 96 AA. NCBI - 100% coverage 100% ID to chr9
	 aligns to 100% masked region, a middle aged LINE L1.  There are mRNA
	 that make it to clusters and altSplice, but get cut at orthoSplice
	 (which seems weird since there are two RNAs.  Oh, but one seems to
	 get lost at the altSplice stage, the longer one.)
WS Q76I85 - several weird splice sites in the middle end up breaking up this
         transcript at the altSplice stage. Two genbank rna transcripts and
	 a refSeq support this though.  Curious!
SE Q7YCG6 - best mapping has lots of stop codons, only 94% identity. Dropped by
         orthoSplice in spite of a small pile-up of RNAs in region..
SC Q8N9M9 - no blat alignment to genome. 124 AA. NCBI - 100% coverage 99% ID to chr3
         Single exon transcript CDS entirely in repeat (LTR). Overlaps spliced
	 alignments. No exoniphy. Just single transcript.
SE Q8NGJ9 - single exon refSeq dropped by orthoSplice in spite of exoniphy overlap
DE Q96DZ9 - an exon gets dropped by orthoSplice
TS Q9UC29 - 12 letters long - couldn't web blat to genome
TS Q9UC89 - 17 letters long - couldn't web blat to genome
SE Q9UCZ0 - orthoSplice drops single exon gene in spite of Exoniphy & refSeq.
SE Q9UCZ2 - orthoSplice drops single exon gene in spite of Exoniphy & refSeq.

###########################################################################
## Some specific regions of interest.

chr22:41,921,231-41,927,980 - two overlapping single exon transcripts.
      exonWalk keeps shorter of two.  Mouse synteny, but not well
      conserved in mammals. Ensembl gene and UCSC known gene on shorter
      transcript. RNA from longer transcript has copy number variation
      compared to genome.
    Take 2 - now two separate transcripts because of 19 base insert in
    both RNAs.  Is a polymorphism? Is in a simple tandem repeat.
chr22:42,545,257-42,600,891 - two single exon transcripts overlap the last
      exon of a cluster of spliced transcripts. Good mouse synteny throughout
      Ensembl, UCSC Known ignore single exon transcripts. Makes it through
      orthoSplice, but single exon transcript not emitted by exonWalk.
    Take 2 - looks like still need some ExonWalk fixing to handle this.
chr22:43,265,931-43,274,172 - gene mostly single exon but short 5' UTR exon
      as well.  RefSeq/mRNA is enough to get this into set. 
    Take 2 - looks like this is working decently now.
chr22:44,324,300-44,435,566 - gene with 6 single exon transcripts overlapping
      final spliced exon.
    Take 2 - single exon transcripts ignored.  Maybe ok in this case.
chr22:45,008,587-45,039,578 - Four rna clusters in 30 kb.  Good mix of spliced
      and single exon.
    Take 2 - Not real sure about how AK127072 is handled.
chr22:45,067,214-45,074,098 - Single exon gene. 5 RNAs. filtered out by
      orthoSplice. On main syntenic mouse chain, but looks like low 
      coverage of the alignment in this region. No Exoniphy. Has Ensembl
      and current UCSC Known.
    Take 2 - The single exon gene seems nicely done by the median end
      handling. 
chr22:14,632,219-14,671,164 - Two genes on opposite strands.  Smaller
      2 exon gene gets turned into intron only by orthoSplice, and this
      is read as exon by exonWalk.  Bigger one is c set, smaller d set.
    Take 2 - Seems fixed.

###########################################################################
## Some specific regions of interest take two.
chr22:27,492,855-27,536,126 - Two genes with some transcripts that are 
      nicely cleaned up by the txPslToBed phase.
chr22:34,911,084-34,940,293 - Region of spotty orthology. Terminal exon
      of BC006276 removed correctly since no other support. There are
      two orthologous adjacent exons, but no orthologous intron in between.
      This seems correct, as mouse has an additional exon in middle...
chr22:35,005,411-35,115,427 - Strongly orthologous region. Myosin heavy chain.
      A lot of fragmentary transcripts seem to be nicely merged in graph.
      What looks like it might be alt splicing at a distance is revealed
      as noise by the txPslToBed phase.
chr22:35,213,851-35,218,377 - 3' end of reverse strand gene with 11 mRNAs.
      It selects the longest for the 3' end, as does RefSeq.  Probably a 
      good thing here, but ... not sure how the soft end attached to
      hard end consensus code is actually working, I'd have thought it
      might kick in sooner.  That's right, it needs *5* to override
      longest.  Maybe that's good for ESTs, but for mRNA, perhaps 1 or
      two should do?
chr22:35,236,804-35,236,989 - Similar case where there's enough mRNAs (20?)
      that longest is overridden.  Seems maybe reasonable.  Looks like this
      gene may have several polyadenylation sites.  On the other hand, 
      RefSeq does choose the largest, so perhaps we should stick with them?
chr22:35,232,252-35,233,254 - Looks like has perhaps three promoters, two
      of which are conserved in mouse, and good RNA evidence for the third.
      All seem preserved in the trimmed graph.
chr22:35,251,888-35,252,258 - Longer exon, caused by being initial rather
      than middle exon in some transcripts, supported by two mRNAs and mouse
      orthology, preserved in trimmed graph.
      Take 3 - still good.
      Take 4 - lost longer exon. No problem though, it's just 6 bases longer,
               and take 4 snaps at 6 rather than 5 bases.
chr13:29,797,646-29,834,682 - Not clear why last exon of CR598046 survives
      trimming.  Looks like this is because the single exon mRNA AK130570, 
      which does overlap the third exon, gets assigned as support for the
      first exon instead!?  Likely a fixable bug with a fairly small test
      case.
      Take 3 - looks fixed.
      Take 4 - still good.
chr6:132,312,312-132,312,480 - Single mRNA with a slightly shorter exon
      looks like it gets swept up in orthology with larger exon, and preserved
      even when it shouldn't be.
      Take 3 - still a problem.
      Take 4 - still a problem.
chr6:132,495,393-132,514,405 - Single initial exon kept from two fairly
      unrelated transcripts that share a single splice site.  Maybe this
      is ok?  Maybe want a little stronger filter here somehow....
      Note in browser at view of chr6:132,427,897-132,618,026 these
      don't look to line up as well as perhaps they should.
      Take 3 - still the same.
      Take 4 - still the same.
chr6:132,690,933-132,690,960 - A soft end that should be merged into
      nearby shorter hard end.  Looks like just mRNA ended barely the
      other side of the splice site, and blat didn't find the other
      exon, and instead continued...  Likely a fixable bug with a small test
      case.
      Take 3 - fixed.
      Take 4 - still fixed.
chr20:33,329,929-33,336,101 - The RNA BC014403 is on the wrong strand,
      and thus gets broken up at each exon because of bad splice sites.
      The txPslToBed phase should fix this instead. A fixable bug with small
      test case.
      Take 3 - still busted.
      Take 4 - fixed.
chr20:33,336,445-33,343,866 - Good test case for exonWalk.  Two transcripts
      that share some splice sites, but vary quite a bit.  Could get
      combinatorial easily if using a naive walker.  ExonWalk does just
      generate 2 transcripts here, but I don't agree with them.
      Take 3 - still good.
      Take 4 - still good.
chr20:33,351,905-33,360,150 - Seems like several large unspliced mRNAs get
      merged into the final exon inappropriately.  They should be kept
      separate, merged with each other instead maybe?
      Take 3 - fixed.
      Take 4 - still fixed.
chr20:33,414,646-33,451,743 - Some relatively complicated alt-splicing.
      Another good test case for the walker.  Grapher seems to be doing
      the right thing.
      Take 3 - still good.
      Take 4 - still good.
chr22:47,213,740-47,601,299 - Another case where the wrong exon got assigned
      support for an RNA?  See txWalk of AY358847.
      Take 3 - better.  Looks like this had two bugs, and just one was fixed.
               Other bug is mRNA that needs flipping to other strand.
      Take 4 - better still.  Still we have one stray little (45 base)
               single base exon left.  It's a fragment of a larger transcript.
	       Perhaps could put in a rule if there's a larger transcript
	       that a single exon gene is part of, the single exon bit of
	       it vanishes?

###########################################################################
## Other notes
The immunoglobulin heavy chain cluster on chr14 is exceptional, and
difficult to cluster.  There's about 10,000 overlapping mRNAs in the
region.  This made many things slow on chr14, particularly the
txgAddEvidence program.  In response I put a "maxJoinSize" option in
txBedToTxg that prevents adding soft introns larger than 70k.  This
cut the largest cluster from 2191 edges to 1008.

This was the old txOrtho run:
CPU time in finished jobs:      17441s     290.68m     4.84h    0.20d  0.001 y
IO & Wait Time:                   403s       6.72m     0.11h    0.00d  0.000 y
Average job time:                 364s       6.07m     0.10h    0.00d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:            3437s      57.28m     0.95h    0.04d
Submission to last job:          3437s      57.28m     0.95h    0.04d

After change this became:

CPU time in finished jobs:      17963s     299.39m     4.99h    0.21d  0.001 y
IO & Wait Time:                   402s       6.70m     0.11h    0.00d  0.000 y
Average job time:                 375s       6.25m     0.10h    0.00d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:            2850s      47.50m     0.79h    0.03d
Submission to last job:          2850s      47.50m     0.79h    0.03d

The longest job is now chr1, not chr14.  This also made the
txgAddEvidence for chr14 go down to 0.3 seconds.

Also optimized the txOrtho to use rangeTrees to index the chainBlocks.  Before
the change the run time was:
Completed: 49 of 49 jobs
CPU time in finished jobs:      18162s     302.70m     5.04h    0.21d  0.001 y
IO & Wait Time:                   450s       7.50m     0.13h    0.01d  0.000 y
Average job time:                 380s       6.33m     0.11h    0.00d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:            2901s      48.35m     0.81h    0.03d
Submission to last job:          2901s      48.35m     0.81h    0.03d

After it's

Completed: 49 of 49 jobs
CPU time in finished jobs:       1916s      31.93m     0.53h    0.02d  0.000 y
IO & Wait Time:                   540s       9.00m     0.15h    0.01d  0.000 y
Average job time:                  50s       0.84m     0.01h    0.00d
Longest running job:                0s       0.00m     0.00h    0.00d
Longest finished job:             235s       3.92m     0.07h    0.00d
Submission to last job:           235s       3.92m     0.07h    0.00d


###########################################################################
## Feb 14 notes -
Rewrote graph maker (now txBedToGraph) and walker (now txWalk).  

The graph maker hopefully will eliminate a number of small problems
with the old graph maker, as well as being considerably more efficient.
The txGraph/txBedToGraph/makeGraph.c module is essentially a rewrite of 
hg/lib/ggGraph.c. It relies heavily on the rbTree and rangeTree modules,
in the process eliminating a lot of linear list traversal. 

The walker is less ambitious than exonWalk in some ways.  It is not
even trying to build gene structures out of ESTs.  It stitches
together exons purely based on transcripts it has seen.  It does
do a good job of a consensus genome-projection of exons from RNA
alignments. I think it does a pretty good job of selecting a minimal
number of transcripts that cover all exons.  It gives explicit and
overriding weight to RefSeq transcripts.

Weird regions still:
	chr21:39,478,728-39,497,245 - broken up RefSeq NM_033656 exon
	    This is a case where there is a 7 base gap in the alignment
	    of a huge exon.  It's 6366 bases on one side of the gap and
	    820 bases on the other side.  This does get broken up by 
	    txPslToBed, but both sides are emitted.
	      There's two ways the txBedToGraph goes wrong here.
	    first, the consensus really does need to favor refSeq if
	    available. Second, double-soft-end exons need to be merged
	    into hard-ended exons that nearly or completely cover them.
	   Take 4 - fixed.
	chr21:39,628,929-39,649,503 - retained intron.
	   Take 4 - still there.  We'll want to keep this actually
	            until we get to the ranking/filtering step.
	chr19:147,747-153,011 - broken up and little pieces
	        retained in spite of no other support?
	   Take 4 - fixed.  Actually looks fixed in take 3.

Important remaining issues:
1) Deal with fixing strand at txPslToBed phase. - done
2) Make end consensus based on median of evidence, not
   median of ends. - done
3) Make orthologous support more specific - if it hits an
   exact one (both hard ends) then *only* increase weight there.
   For inexact matches down-weight significantly.
4) Add EST evidence to pipeline.

###########################################################################
# For take 4 here are 10 cases where txWalk doesn't overlap RefSeq
1) chr7:115,717,168-115,717,341 - single exon fragment on wrong strand.
2) chr11:116,446,694-116,447,535  - Single exon gene with two mRNAs supporting it.
3) chr21:33,066,282-33,091,884 - spliced gene with 4 transcripts
4) chr22:31,235,858-31,237,788 - two mRNA. Have 2 long exons connected by 10k
   gap with non-intronic ends.  One exon has mouse ortholog.  Looks to me
   like perhaps a mutation/sequencing error in the reference genome.
5) chr2:118,488,220-118,489,717 - single exon gene with two mRNAs supporting it.
6) chr2:234,349,109-234,353,167 - interesting case. Three RNAs support it. All
   three of the RNAs seem to have an exact repeat covering half of themselves
   that isn't in genome. All three are from same project - the ENCODE project!
7) chr13:112,742,663-112,743,328 - Single initial exon of 2-exon mRNA with
   non-canonical splice sites. Terminal exon has support in mouse, and appears
   as a second "gene".  Looks like could use a step at end of txWalk that
   merges together cases like this.
8) chr14:98,772,931-98,773,042 - Single exon supported single mRNA and mouse.
9) chr2:220,128,116-220,132,463 - Cool splicing. Two mRNAs. Exoniphy and mouse
   support.
10) chr20:33,609,922-33,651,008 - Cool splicing. 6 mRNAs, Exoniphy and mouse
    support for many.

###########################################################################
# For take 5, here are 10 cases where RefSeq CDS doesn't map
chr16	843634	960985	NM_022773	CDS	FULL	.	.	.	SPLIT	.	.	.	.	OUT	.
   Refseq disagrees with genome leading to 4 bp "stray" close to end of second exon.
   Other RNAs agree with genome.  Looks like it is good to split here.
  After merging leads to wacked out long exon.
  Fixed.
chr2	31603159	31659544	NM_000348	CDS	FULL	.	.	GAP<7	.	.	BUST<7b	.	.	.	.
   1 BP insert in RNA relative to genome in exon 1. Two other RNAs including MGC full orf
   support refSeq.
  Seems ok now.
chr12	29471755	29541886	NM_183378	CDS	FULL	.	.	.	SPLIT	.	.	.	.	OUT	.
   No RNA here beyond refSeq, which has noncanonical end of intron 27. This does look like
   one that a later stage could stitch together. (No gap in query side of sequence).
  Seems ok now.
chr1	212228482	212276385	NM_002763	CDS	FULL	.	.	.	SPLIT	.	.	.	.	OUT	.
   AT/AC intron 2 causes split.  AT/AC supported by an independent RNA.  It again looks like
   one that will be stitched at a later phase.
  Seems ok now.
chr1	200064910	200120045	NM_018085	CDS	FULL	.	.	.	SPLIT	.	.	.	.	OUT	.
   Noncanonical intron split that will restitch.
  Seems ok now.
chr19	18655424	18749762	NM_025021	CDS	FULL	.	.	.	SPLIT	.	.	.	.	OUT	.
   Noncanonical intron split that will restitch.
  Seems ok now.
chr5	77692094	77812318	NM_004866	CDS	FULL	.	.	GAP<7	.	.	BUST<7b	.	.	.	.
   1 BP insertion in mRNA relative to genome in exon 7.  Of RNAs in region, 7 support RefSeq, 1 supports genome.
  Turns into monster exon.
  Fixed.
chr11	102238673	102250922	NM_002426	CDS	FULL	.	.	GAP<7	.	.	BUST<7b	.	.	.	.
   1 BP insertion in mRNA relative to genome in exon 5. 2 RNAs in region support RefSeq, 1 supports genome.
  Turns into monster exon.
  Fixed
chrX	128766593	128805149	NM_001008222	CDS	FULL	.	.	.	SPLIT	.	.	.	.	OUT	.
   Noncanonical intron split that will restitch.
  Seems ok now.
chr1	1560962	1645635	NM_033492	CDS	FULL	NUDGE	.	GAP<7	SPLIT	.	.	.	.	OUT	.
   Alignment of 5' UTR is very messy.  CDS Alignment better, but still has some gappy stuff.  There are 18
   mRNAs that cover this regions.  Part of a duplicated region. RefSeq shows 15 different isoforms.  This
   is the stuff of which curator nightmares are made! 
  Seems more or less ok.

###########################################################################
# Weird regions take 5
chr22:17,546,746-17,665,740 NM_007098 has wacked out huge exon.
  fixed.

NM_004866	chr5	77692094	9
77692094,77720416,77747099,77748120,77750354,77753380,77781512,77790823,77807082,
0,0,0,0,1,1,2,2,0,
0,0,0,0,2,2,1,0,1,0,

############################################################################
# Things to fix and test cases for take 6.  Looked at perhaps 5 megabases
# of chromosome 1, and first part of chr22 (up to chr22:22,311,744-22,405,735)

pslCdnaFilter - run so as to better pick better of two mappings.
        Test case: chr22:20,150,617-20,157,116 AK093171
    Nope, still passes.
abFragFilter - get rid of all but selected few antibody fragments.
        Test case: chr22:20,946,751-21,035,748
    Seems to be working well. It passes LOC96610, which is a gene in the
    middle of region that shares a few exons with AB fragments, but suppresses
    at least the vast majority of AB fragments.
txBedToGraph  - Is end of CU013202 acting as hard stop and hard end?  Why
         is CU013202 a separate transcript, not merged with NM_003347?
     This is from CU013202 getting moved to next hard end, and
     NM_003347 extending far beyond it.   It should come out of
     the wash when we do away with transcripts completely enclosed
     by other transcripts.  There is indeed a site here that is both
     a 5' and a 3' splice site.  (The sequence is agtg, and there's
     a very nice RNA supporting each use.)
txBedToGraph  - Why are BC040020 and NM_080764 not merged?
     Hmm, graph looks ok.  Maybe a walker weirdness from BC040020
     being broken up *and* the smaller fragment overlapping the
     extension of the larger fragment by the refSeq UTR.
     Keep an eye on it for chr22.601.
txAddEvidence - Don't add evidence for double-hard-end that is only supported
         on one end by another double-hard-end.  Test case
	 chr22:19,221,254-19,221,743 the slightly larger exon on AK074268.
     This seems to be added to txAddEvidence in general, but the test
     case is rescued anyway by EST evidence this round.
txAddEvidence - Suppress adding hard-end evidence to double-soft that
         overlaps.  Test case chr22:22,298,169-22,319,055.
     Fixed take 6.
txWalk - make it output all transcripts that hit an edge with sufficient
         evidence, and which are not subsets of previous transcripts.
	 Test case chr1:67,646,622-67,669,121, currently only outputs
	 3 of 4 refseqs.
     Fixed take 6.
txWalk - for single-exon transcripts that overlap weakly, don't count
         overlap as 2 pieces of evidence, just 1.1 or something.
	 Test case chr22:20,109,925-20,125,924.
     Fixed take 6.
txWalk - Remove transcripts completely enclosed on tx level that differ
         only at 3' or 5' end.  Test case NM_176877 and AF397170. Also
	 NM_005446 and AF065385.
     Fixed take 5.
txgAnalyze  - flag retained introns. Test case chr22:17,280,005-17,306,568.
     Done take 6.
txFilter - Remove transcripts that are the result of fragmented RNA
         where larger part of RNA is in another transcript.
	 Test case chr22:19,863,475-19,871,029.
     This case fell out of the wash with the single exon handling.
	 Test case chr22:18,080,172-18,095,585
txFilter - Merge transcripts that are the result of fragmented RNA
         where good evidence and multiple introns on both sides.
	 Test case chr22:19,881,325-19,920,204.
     This case fell out of the wash with the more stringent best in 
     genome mRNA filter.  This is good, because it would be a real
     pain to fix, and hopefully it is rare enough it can be ignored.
txFilter - Remove single exon genes that are more than 55% enclosed within
	   exon of multiple exon gene.  Test case chr22:18,608,081-18,611,968.
	   Also chr22:19,970,725-19,975,524
	 First test case handled by other single exon filters.
	 Second test case fell out of the wash with the more stringent
	 mRNA placements.
txCdsAddEv - Process selenocysteines and alternative starts.
txCdsPick - Pick RefSeq CDS for RefSeq genes.
txClusterAnalyser - Identify CDS that is not shared with dominant CDS
           from transcript cluster and remove.

######################################################################
# Stuff to consider for take 7
o - What happened to  NM_032476 at chr21:34,351,145-34,448,007?
    Looks like a txWalk problem.  Fixed.
o - When gene has retained intron and ORF doesn't overlap (in frame)
    ORF of other genes in cluster, consider nuking ORF.  Hmm, this
    seemed to cause more problems than it solved.
o - Downweight ORF score when NMD candidate: chr21:32,865,403-32,872,381.
    (Perhaps taking this too far now)
o - Another dubious ORF chrX:153,925,985-153,942,224 AX746776 from 
    incompletely processed transcript?  NMD candidate.
o - chr19:59,394,825-59,444,662  A transcriptionally active cluster.
    Looks like maybe falsely separated into two by refSeq.
o - chrX:153,917,080-153,942,391 is a good example of a CDS to nuke.
    The bestorf score is 0. It is an NMD target. It doesn't overlap
    with other CDS's in region.
o - chr13:112,573,175-112,597,164 is an example of a single exon transcript
    that probably should be removed from overlapping too much a coding
    transcript.
o - chr19_random:158,550-294,266 - very difficult region.  What we're
    doing is not yet enough to separate protein coding genes.  They all
    end up in one big cluster even though refSeq, rightly I think, has
    several distinct genes.  At any rate I don't know how to sort this out.
o - chr5:69,470,446-69,917,303 - another very difficult region.
o - chr22:43,456,057-43,654,296 - another region with transcripts that
    cross genes.  
o - chrX:152,822,047-152,827,969 is a good place for CDS frame clustering
    and maybe reevaluating of BC080603's CDS.

######################################################################
# Stuff to consider for take 8
o - Leave in NMD ORFs, especially for RefSeq, just mark as bad?
     - Done
o - Change accession from TX12345678 to UC1234567?
     - Not yet
o - Put in soft-join at funny introns like we do with RefSeq.
    It's just a bug that we don't.  Fixed!
     - Fixed, but this ended up making a lot of funny transcripts,
       so took out the soft join.
o - Add in antibodies as noncoding genes at a fairly late phase.
     - Done.  Not a too horrible kludge
o - Remove accessions in
	markd/compbio/mgc/cListPaperData/data/hgdata/athersysRage.acc
                                                invitrogenCrap.acc
    from input.


######################################################################
# Stuff to consider for take 10
o - Gather some information on dog and maybe mouse conservation in
    txInfo table.  Possibly:
        ORF size in dog.
	ORF size in mouse.
	Ka/Ks within ORF in dog and mouse.
	Transcript size in dog.
	Transcript size in mouse.
o - Put in rRNA/tRNA/snoRNA etc labels in category field of txInfo where
    appropriate.
o - Do tighter near-best placements, using Robert & David's method.
o - Separating clusters that contain non-overlapping refSeq clusters
    into separate genes.  See chr22:16,936,036-17,018,244. But how???
o - Here refSeq has two genes with same transcript just to get different
    proteins.  chrX:153,939,681-153,954,680

######################################################################
# Examining 10 cases in KG2 not Take 7
chr17:50,384,266-50,387,243 - AK124809. Single exon gene without
  quite enough support.  1.5 mRNA.  0 EST, Exoniphy, Ortho.
chr1:1,327,156-1,332,490 - AK000886 not enough evidence for this
  isoform, which appears to have a retained intron.
chr2:43,307,984-43,309,498 - Hmm.  Three mRNAs for this single exon
  clone, but two of them are just in middle, CDS region.  Long UTR
  of third clone washes this out, perhaps inappropriately.  No orthology
  spliced est or exoniphy to rescue it.
chr2:111,903,366-111,904,956 - Another single exon gene.  This one overlaps
  a spliced gene.  Has a second RNA that covers most of it, and some
  mouse orthologous transcription evidence.  Still, looks much like a
  retained intron. Arguably should be rescued.
chr2:201,742,307-201,744,647  Another single exon gene. 3 RNAs overlap
  parts of it, but 2 are just a small part, and third only 2/3rds.  I wonder
  if the coverage decision is made after the reduction to median though.
  This might rescue this case, though not clear we should.
chr3:47,578,733-47,597,286 - We choose shorter 3' UTR to follow refSeq.
chr3:49,272,522-49,273,751 - Single exon gene with no other mRNA.
   It does have a reasonable pile up of ESTs on the 3' end (also 
   unspliced).
chr3:196,899,961-196,903,089 - Single exon gene with 1.5 supporting mRNAs
chr4:82,609,187-82,609,498 - Single exon gene with only 1 supporting RNA
chr6:14,322,418-14,326,268 - Single exon gene with only 1 supporting RNA

#####
# Preparing hg18a -
Copied over files from /var/lib/mysql/hg18 to /var/lib/mysql/hg18a
except for the chain and net tables, and the following tables:
n/a HInv
n/a HInvGeneMrna
n/a all_bacends
n/a all_fosends
n/a bacEndPairs
n/a blastKGPep04
n/a blastKGRef04
done ceBlastTab
done dmBlastTab
done drBlastTab
done foldUtr3
done foldUtr5
n/a fosEndPairs
done gnfAtlas2Distance
done gnfU95Distance
???? humanBlastTab - looks to be obsolete
???? humanHprdP2P
done humanVidalP2P
done humanWankerP2P
done kgAlias
done kgProtAlias
???? kgProtMap
???? kgSpAlias
done kgXref
done knownBlastTab
done knownCanonical
done knownExpDistance
done knownGene
done knownGeneMrna
done knownGenePep
done knownIsoforms
done knownToAllenBrain
done knownToEnsembl
done knownToGnf1h
done knownToGnfAtlas2
done knownToHInv
???? knownToHprd
done knownToLocusLink
done knownToPfam
done knownToRefSeq
done knownToU133
done knownToU133Plus2
done knownToU95
done knownToVisiGene
done mmBlastTab
n/a multiz17way
n/a multiz17wayFrames
n/a multiz17waySummary
n/a oreganno
n/a oregannoAttr
n/a oregannoLink
n/a phastCons17way
n/a phastConsElements17way
n/a regPotential17X
n/a snp126
n/a snp126ExceptionDesc
n/a snp126orthoPanTro2RheMac2
n/a snpSeq
???? pbAnomLimit
???? pbResAvgStd
???? pbStamp
???? pepCCntDist
???? pepExonCntDist
???? pepHydroDist
???? pepIPCntDist
???? pepMolWtDist
???? pepMwAa
???? pepPi
???? pepPiDist
???? pepResDist
done rnBlastTab
done scBlastTab
n/a uniGene_3
n/a xenoEst
n/a xenoMrna
n/a xenoRefFlat
n/a xenoRefGene
n/a xenoRefSeqAli

#####################
SVM STUFF
# Build the SVM training file train.vector, then train a model from it.
# First pass: all txWalk beds with NR_unweighted.tce and cdsOrtho.tab; bad.lst
# is passed via -bad (presumably the negative examples -- confirm against
# txCdsSvmInput usage).  This call creates train.vector.
cat txWalk/*.bed | txCdsSvmInput stdin NR_unweighted.tce cdsOrtho.tab -bad bad.lst train.vector
# Second pass: same beds with NM_unweighted.tce and good.lst via -good; the
# vectors go to stdout, randomLines picks 604 of them, and those are appended
# (>>) to train.vector, so the first command must already have been run.
cat txWalk/*.bed | txCdsSvmInput stdin NM_unweighted.tce cdsOrtho.tab -good good.lst stdout | randomLines stdin 604 stdout >> train.vector
# Train on train.vector; the model is presumably written to train.model
# (svm_learn <examples> <model> argument order -- TODO confirm).
svm_learn train.vector train.model
#######################
Issues for take 10
chr10:70,562,806-70,562,873 - Here a refSeq peptide gets mapped, somehow
    forces a genomic frame shift up through the remapping.  This gets lost in the
    final gene set though....  This is back more or less.  The shifted codon
    in the mRNA gets cut out.  This is reasonable now.
chr14:104,277,141-104,277,163 - Another case where boundaries wiggle in
    ucscGenes vs cds Mappings....  In this case no "genomic frame shift"
    Still does involve using a refPepValidated that's in two frames though.
chr21.490.2.AK128598 chr21:44,050,073-44,056,876 - A case with CDS in
    long exon bleed. Should be flagged for NMD but isn't.
NM_014638 Another case where boundaries shift in ucscGenes vs. cds mappings.
    fixed.

######################
Notes for take 11:
NM_004390, which overlaps another refseq, seems to have fallen out
somewhere between the graph and walking stage.
X17115 - use as IgM heavy chain?

Please take out
 /cluster/data/genbank/data/exceptions/invitrogenFullLength.acc

Increase txCdsThreshold to 871.42.
