#This describes how to lift annotations from one assembly to another.
#It works best if the assemblies are relatively close.
#
# NB - This instructions are not tested yet.  It's best to
# run them one at a time and check results for now. 

# Here are some general guidelines for creating and using 
# liftOver chain files ("over.chain"). Note that the "old" assembly
# is also the "from", "target", and "reference" assembly.
# Most of the work will happen in the "old" assembly build area.
# and be doc'ed in its make doc.
#
#-------------------------------------------------------------------------
* Name liftOver chain files:
        <OLD assembly>To<New assembly>.over.chain
            e.g.  hg15ToHg16.over.chain

* Name liftOver chain tables:
        chainTo<New assembly> in the OLD assembly database
                e.g.  hg15: chainToHg16

* Same organism lifts
    - Split the NEW assembly (query) into 3K chunks (faSplit -oneFile)
        and save on cluster disks (or iservers, or bluearc), along with
        lift files.  Doc this in the make doc for the NEW assembly.
    - BLAT the NEW assembly (query) against the OLD assembly (target),
        then chain (axtChain), net (chainNet), and 
        extract single-coverage chains from the net (chainNetSubset)
        Do this in the OLD assembly build area, in the directory
        bed/blat.<NEW-assembly>.date
    - For convenience link the chain file into the OLD assembly
        directory bed/bedOver
    - Save gzipped files in the "liftOver" download directory for the 
        OLD assembly:
            e.g. goldenPath/hg13/liftOver/hg13ToHg16.over.chain.gz
    - When lifting annotations, distinguish from "native" annotations
        by naming the build dir and table:
              <track>Lift      e.g.   ensemblGeneLift 
        Do this in the NEW assembly build directory and make doc.
    - NOTE: When loading a new genome, create a liftOver chain for each 
            earlier assembly that is still public.

* Other organism lifts (homology mapping)
    - Create liftOver chain by extracting single-coverage chains
        from the blastz-based net (chainNetSubset).
        Do this in the the OLD assembly bed/blastz.<assembly> directory.
    - Save files in the vs<NEW assembly> download directory for the 
        OLD assembly:
                e.g. goldenPath/hg16/vsMm4/hg16ToMm4.over.chain.gz
    - CAVEAT: cross-species liftOver chains are not adequate for homology
        mapping -- judicious use of other tools (e.g. orthoMap) and
        human intervention will generally be required.
        (This should be mentioned in the download directory README)
    - NOTE: Create a liftOver chain for each blastz-based net that is built


#-------------------------------------------------------------------------
#
# I. Preliminaries.  Set up some environment variables to point to
# data sources and a temporary dir with lots of space:
set tempDir = /mnt/hg12
mkdir -p $tempDir

# II BLAT Alignments.  Blat can be quite fast, but it still does not handle
#  long query sequences especially well.   First you'll need to generate
#  a copy of the genome split into 3kb chunks.  Then run a cluster blat job
#  using the -fastMap option, and then lift up the results:
  
cd $tempDir
ssh kk
mkdir lift raw psl blatRun 
mkdir /scratch/split
foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y M)
    faSplit -lift=lift/chr$i.lft size ~/oo/$i/chr$i.fa -oneFile 3000 /scratch/split/chr$i
end
#and sync the cluster local scratch disks.

cd blatRun
echo '#LOOP' > gsub
echo 'blat $(path1) $(path2) {check out line+ ../raw/$(root1)_$(root2).psl} -tileSize=12 -ooc=$HOME/hg/h/12r.ooc -minScore=100 -minIdentity=98 -fastMap' >> gsub
echo '#ENDLOOP' >> gsub
ls -1S /scratch/split/*.fa > new.lst
ls -1S /scratch/hg/hg12/mixedNib/*.nib > old.lst
gensub2 old.lst new.lst gsub spec
para make spec

#Wait for this job to finish.  Then
cd ../raw
foreach i (1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 X Y M)
    liftUp -pslQ ../psl/chr$i.psl ../lift/chr$i.lft warn chr*_chr$i.psl
    echo done $i
end
cd ..
rm -r raw

# III - Converting to net
mkdir chainRun chainRaw chain
cd chainRun
echo '#LOOP' > gsub
echo 'axtChain -psl $(path1) /cluster/store3/gs.13/build30/mixedNib /scratch/hg/gs.14/build31/chromTrfMixedNib {check out line+ ../chainRaw/$(root1).chain}' >> gsub
echo '#ENDLOOP' >> gsub
ls -1S ../psl/*.psl > in.lst
gensub2 in.lst single gsub spec
para make spec
#Wait for this to finish, then

cd ..
chainMergeSort chainRaw/*.chain | chainSplit chain stdin
mkdir net
cd chain
foreach i (*.chain)
    chainNet $i ~/lastOo/chrom.sizes ~/oo/chrom.sizes ../net/$i:r.net /dev/null
    echo done $i
end

# IV - Creating over.chain
mkdir ../over
foreach i (*.chain)
    netChainSubset ../net/$i:r.net $i ../over/$i
    echo done $i
end
mkdir ~/oo/bedOver
cat ../over/*.chain ~/oo/bedOver/over.chain

# V - Lifting Ensembl:
# Get onto hgwdev
cd ~/oo/bedOver
mkdir ensembl
cd ensembl
hgsql -N hg12 > old.gp <<end
select * from ensGene
end
liftOver -genePred old.gp ../over.chain new.gp unmapped.gp


