# for emacs: -*- mode: sh; -*-

#	Rabbit
# Oryctolagus cuniculus from Broad, version oryCun2 (released Apr 2009)
#    Project website:
#    ftp://ftp.broad.mit.edu/pub/assemblies/mammals/rabbit/oryCun2/

############################################################################
# Download files from Broad (DONE - 2009-08-11 - Hiram)
    mkdir /hive/data/genomes/oryCun2
    cd /hive/data/genomes/oryCun2
    mkdir broad
    cd broad
    wget --timestamping \
"ftp://ftp.broad.mit.edu/pub/assemblies/mammals/rabbit/oryCun2/*"

    #	fixup the quality scores
    qaToQac assembly.quals.gz stdout \
	| qacAgpLift Chromosomes.agp stdin chromosomes.qual.qac

############################################################################
# Build browser (DONE - 2009-08-11 - Hiram)
    cd /hive/data/genomes/oryCun2
    cat << '_EOF_' > oryCun2.contig.ra
# Config parameters for makeGenomeDb.pl:
db oryCun2
clade vertebrate
scientificName Oryctolagus cuniculus
commonName Rabbit
assemblyDate Apr. 2009
assemblyLabel Broad Institute oryCun2 (NCBI project 12819, AAGW00000000)
orderKey 189
mitoAcc NC_001913
fastaFiles /hive/data/genomes/oryCun2/broad/assembly.bases.gz
agpFiles /hive/data/genomes/oryCun2/broad/Chromosomes.agp
qualFiles /hive/data/genomes/oryCun2/broad/chromosomes.qual.qac
dbDbSpeciesDir rabbit
taxId 9986
'_EOF_'
    # << happy emacs

    #	verify sequence is OK before going on
    makeGenomeDb.pl -stop=agp oryCun2.config.ra > agp.log 2>&1
    #	continuing
    makeGenomeDb.pl -noGoldGapSplit -continue=db oryCun2.config.ra \
	> db.log 2>&1

##########################################################################
# Repeat Masker (DONE - 2009-08-11 - Hiram)
    mkdir /hive/data/genomes/oryCun2/bed/repeatMasker
    cd /hive/data/genomes/oryCun2/bed/repeatMasker
    doRepeatMasker.pl -verbose=2 -workhorse=hgwdev \
	-noSplit -buildDir=`pwd` oryCun2 > do.log 2>&1 &
    cat faSize.rmsk.txt
# 2737490501 bases (133467217 N's 2604023284 real 1466215749 upper
#	1137807535 lower) in 3242 sequences in 1 files
# %41.56 masked total, %43.69 masked real

########################################################################
# Simple Repeats (DONE - 2009-08-11 - Hiram)
    mkdir /hive/data/genomes/oryCun2/bed/simpleRepeat
    cd /hive/data/genomes/oryCun2/bed/simpleRepeat
    doSimpleRepeat.pl -workhorse=hgwdev \
	-buildDir=`pwd` oryCun2 > do.log 2>&1 &
    #	fails on the job for chrM, make an empty result:
    touch /hive/data/genomes/oryCun2/TrfPart/009/009.lst.bed
    doSimpleRepeat.pl -workhorse=hgwdev -continue=filter \
	-buildDir=`pwd` oryCun2 > filter.log 2>&1 &
    #	about 13 minutes
    cat fb.simpleRepeat 
    #	11549259 bases of 332311746 (3.475%) in intersection

########################################################################
# Create a ctgPos2 table to show scaffolds (DONE - 2009-08-11 - Hiram)
    cd /hive/data/genomes/oryCun2/broad
    cat << '_EOF_' > mkCtgPos2.pl
#!/usr/bin/env perl

use strict;
use warnings;

my %scafStart;  # key is scaffold name, value is scaffold start coordinate
my %scafEnd;    # key is scaffold name, value is scaffold end coordinate
my %ctgStart;   # key is scaffold name, value is ctg name where it starts
my %ctgEnd;     # key is scaffold name, value is ctg name where it ends
my @scafNames;
my $scafCount = 0;

my $prevCtg = "";
my $prevEnd = 0;
my $scafName = "";

open (FH, "<assembly.agp") or die "can not read assembly.agp";
while (my $line = <FH>) {
    my ($name, $start, $end, $id, $type, $ctg, $rest) = split('\t', $line, 7);
    if (length ($scafName) > 0) {
        if ($scafName ne $name) {
            $scafEnd{$scafName} = $prevEnd;
            $ctgEnd{$scafName} = $prevCtg;
            $scafName = $name;
            $scafStart{$scafName} = $start;
            $ctgStart{$scafName} = $ctg;
            $scafEnd{$scafName} = $end;
            $ctgEnd{$scafName} = $ctg;
            $scafNames[$scafCount++] = $scafName;
        }
    } else {
        $scafName = $name;
        $scafStart{$scafName} = $start;
        $ctgStart{$scafName} = $ctg;
        $scafEnd{$scafName} = $end;
        $ctgEnd{$scafName} = $ctg;
        $scafNames[$scafCount++] = $scafName;
    }
    $prevCtg = $ctg;
    $prevEnd = $end;
}
close (FH);
$scafEnd{$scafName} = $prevEnd;
$ctgEnd{$scafName} = $prevCtg;

printf STDERR "working with $scafCount scaffolds\n";
printf STDERR "first one: $scafNames[0], last one: $scafNames[$scafCount-1]\n";

my %ctgsChr;    # key is ctg name, value is chrom name
my %ctgsStart;  # key is ctg name, value is chrom start
my %ctgsEnd;    # key is ctg name, value is chrom end

open (FH, "<Chromosomes.agp") or die "can not read Chromosomes.agp";
while (my $line = <FH>) {
    my ($chr, $start, $end, $id, $type, $ctg, $rest) = split('\t', $line, 7);
    if ($ctg =~ m/^contig_/) {
        $ctgsChr{$ctg} = $chr;
        $ctgsStart{$ctg} = $start-1;
        $ctgsEnd{$ctg} = $end;
    }
}
close (FH);

for (my $i = 0; $i < $scafCount; ++$i) {
    my $scafName = $scafNames[$i];
    my $startCtg = $ctgStart{$scafName};
    my $endCtg = $ctgEnd{$scafName};
    my $size = $ctgsEnd{$endCtg} - $ctgsStart{$startCtg};
    die "ERROR: non matching chr name between start and end ctg"
        if ($ctgsChr{$startCtg} ne $ctgsChr{$endCtg});
    if ($size < 0) {
        $size = $ctgsEnd{$startCtg} - $ctgsStart{$endCtg};
        printf "%s\t%d\t%s\t%d\t%d\tW\n", $scafName, $size,
            $ctgsChr{$startCtg}, $ctgsStart{$endCtg}, $ctgsEnd{$startCtg};
    } else {
        printf "%s\t%d\t%s\t%d\t%d\tW\n", $scafName, $size,
            $ctgsChr{$startCtg}, $ctgsStart{$startCtg}, $ctgsEnd{$endCtg};
    }

}
'_EOF_'
    # << happy emacs
    chmod +x mkCtgPos2.pl
    ./mkCtgPos2.pl > ctgPos2.tab
    
    hgLoadSqlTab oryCun2 ctgPos2 $HOME/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab

########################################################################
# Checking *all* gaps - are they all mentioned in the AGP file ?
#	(DONE - 2009-08-12 - Hiram)
    mkdir /hive/data/genomes/oryCun2/bed/allGaps
    time nice -n +19 findMotif -motif=gattaca -verbose=4 \
	-strand=+ ../../oryCun2.2bit > findMotif.txt 2>&1
    #	real    0m7.967s
    grep "^#GAP " findMotif.txt | sed -e "s/^#GAP //" > allGaps.bed
    featureBits oryCun2 -not gap -bed=notGap.bed
    featureBits oryCun2 allGaps.bed notGap.bed -bed=new.gaps.bed
    #	0 bases of 2604023284 (0.000%) in intersection
    # all N's in the sequence are marked as gap, do not need to add anything
    #	to the gap table (see also: tetNig2.txt for procedure)

########################################################################
# Add simple repeats to RM masked sequence (DONE - 2009-08-12 - Hiram)
    cd /hive/data/genomes/oryCun2
    twoBitMask oryCun2.rmsk.2bit -add bed/simpleRepeat/trfMask.bed oryCun2.2bit
    rm /gbdb/oryCun2/oryCun2.2bit
    ln -s `pwd`/oryCun2.2bit /gbdb/oryCun2/oryCun2.2bit
    twoBitToFa  oryCun2.2bit stdout \
	| faSize stdin > oryCun2.2bit.faSize.txt 2>&1
    #	2737490501 bases (133467217 N's 2604023284 real 1465050403 upper
    #	1138972881 lower) in 3242 sequences in 1 files
    #	%41.61 masked total, %43.74 masked real

########################################################################
# create ooc file and kluster data files (DONE - 2009-08-12 - Hiram)
    # calculate rabbit/human size ratio to determine repMatch count
	   oryCun2 real / hg19 real
920 = 1024 * 2604023284 / 2897310462
    blat oryCun2.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/oryCun2.11.ooc -repMatch=920
    #	Wrote 32543 overused 11-mers to jkStuff/oryCun2.11.ooc

    #	create non-bridged contigs lift and sequence
    mkdir /hive/data/genomes/oryCun2/contigs
    cd /hive/data/genomes/oryCun2/contigs
    gapToLift -verbose=2 oryCun2 oryCun2.contigs.lift \
	-bedFile=oryCun2.contigs.bed
    ~/kent/src/hg/utils/lft2BitToFa.pl ../oryCun2.2bit oryCun2.contigs.lift \
	| gzip -c > oryCun2.contigs.fa.gz
    #	verify nothing lost
    faSize *.fa.gz > contigs.faSize.txt 2>&1
    #	2671429501 bases (67406217 N's 2604023284 real 1465050403 upper
    #	1138972881 lower) in 3319 sequences in 1 files
    #	%42.64 masked total, %43.74 masked real
    # only total and N count different, no sequence lost
    faToTwoBit oryCun2.contigs.fa.gz oryCun2.contigs.2bit
    twoBitInfo oryCun2.contigs.2bit stdout | sort -k2rn > oryCun2.contigs.sizes

    cd /hive/data/genomes/oryCun2
    mkdir /hive/data/staging/oryCun2
    cp -p jkStuff/oryCun2.11.ooc oryCun2.2bit \
	chrom.sizes contigs/oryCun2.contigs.2bit \
	contigs/oryCun2.contigs.sizes /hive/data/staging/oryCun2

########################################################################
# lastz swap from Human Hg19 (DONE - 2009-08-27 - Hiram)
    # original alignment
    cd /hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12
    cat fb.hg19.chainOryCun2Link.txt 
    #	1283994337 bases of 2897316137 (44.317%) in intersection

    mkdir /hive/data/genomes/oryCun2/bed/blastz.hg19.swap
    cd /hive/data/genomes/oryCun2/bed/blastz.hg19.swap
    
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/hg19/bed/lastzOryCun2.2009-08-12/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	-swap -syntenicNet > swap.log 2>&1 &
    #	real    176m35.932s
    cat fb.oryCun2.chainHg19Link.txt 
    #	1260477501 bases of 2604023284 (48.405%) in intersection

##############################################################################
############################################################################
# TRANSMAP vertebrate.2009-09-13 build  (2009-09-20 markd)

vertebrate-wide transMap alignments were built  Tracks are created and loaded
by a single Makefile. This is available from:
   svn+ssh://hgwdev.cse.ucsc.edu/projects/compbio/usr/markd/svn/projs/transMap/tags/vertebrate.2009-09-13

see doc/builds.txt for specific details.
############################################################################
# GENBANK AUTO UPDATE (DONE - 2009-09-25 - Hiram)
    #	add the following to genbank/etc/genbank.conf just before Orang ponAbe2
# Rabbit
oryCun2.serverGenome = /hive/data/genomes/oryCun2/oryCun2.2bit
oryCun2.clusterGenome = /scratch/data/oryCun2/oryCun2.2bit
oryCun2.ooc = /scratch/data/oryCun2/oryCun2.11.ooc
oryCun2.lift = /hive/data/genomes/oryCun2/contigs/oryCun2.contigs.lift
oryCun2.perChromTables = no
oryCun2.refseq.mrna.native.pslCDnaFilter  = ${ordered.refseq.mrna.native.pslCDnaFilter}
oryCun2.refseq.mrna.xeno.pslCDnaFilter    = ${ordered.refseq.mrna.xeno.pslCDnaFilter}
oryCun2.genbank.mrna.native.pslCDnaFilter = ${ordered.genbank.mrna.native.pslCDnaFilter}
oryCun2.genbank.mrna.xeno.pslCDnaFilter   = ${ordered.genbank.mrna.xeno.pslCDnaFilter}
oryCun2.genbank.est.native.pslCDnaFilter  = ${ordered.genbank.est.native.pslCDnaFilter}
oryCun2.genbank.est.xeno.pslCDnaFilter    = ${ordered.genbank.est.xeno.pslCDnaFilter}
oryCun2.downloadDir = oryCun2
oryCun2.refseq.mrna.native.load  = yes
oryCun2.refseq.mrna.xeno.load = yes
oryCun2.refseq.mrna.xeno.loadDesc  = yes

    # Edit src/lib/gbGenome.c to add new species.  With these two lines:
    #	static char *papHamNames[] = {"Papio hamadryas",
    #		"Papio hamadryas hamadryas", NULL};
    #	{"papHam", papHamNames},
# Oryctolagus cuniculus from Broad, version oryCun2 (released Apr 2009)

    cvs ci -m "Adding Baboon Papio hamadryas hamadryas" src/lib/gbGenome.c
    make install-server

    ssh genbank
    screen  # control this business with a screen since it takes a while
    cd /cluster/data/genbank

    time nice -n +19 bin/gbAlignStep -initial oryCun2 &
    #	logFile: var/build/logs/2009.09.25-14:55:27.oryCun2.initalign.log
    #	real    163m41.298s
    #	something when haywire in the swarm batch run, finished
    #	that manually, then, continuing:
    time nice -n +19 bin/gbAlignStep -continue=finish -initial oryCun2 &
    #	var/build/logs/2009.09.28-10:46:25.oryCun2.initalign.log
    #	real    76m41.952s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad oryCun2
    #	var/dbload/hgwdev/logs/2009.09.28-12:15:42.dbload.log
    #	real    32m14.821s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add oryCun2 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added oryCun2 - Oryctolagus cuniculus - Rabbit" \
	etc/align.dbs etc/hgwdev.dbs
    make etc-update
    #	done - 2009-09-28 - Hiram

############################################################################
#	update defaultDb and default position to RHO refSeq gene
    hgsql hgcentraltest \
    	-e 'update defaultDb set name="oryCun2" where genome="Rabbit";'
    hgsql -e \
'update dbDb set defaultPos="chr9:11026346-11032288" where name="oryCun2";' \
	hgcentraltest

############################################################################
# cavPor3 Guinea Pig alignment (DONE - 2010-01-21 - Hiram)
    mkdir /hive/data/genomes/oryCun2/bed/lastzCavPor3.2010-01-21
    cd /hive/data/genomes/oryCun2/bed/lastzCavPor3.2010-01-21

    cat << '_EOF_' > DEF
# Rabbit vs. Guinea Pig

# TARGET: Rabbit oryCun2
SEQ1_DIR=/scratch/data/oryCun2/oryCun2.2bit
SEQ1_LEN=/scratch/data/oryCun2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=50

# QUERY: Guinea Pig
SEQ2_DIR=/scratch/data/cavPor3/cavPor3.2bit
SEQ2_LEN=/scratch/data/cavPor3/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LIMIT=50
SEQ2_LAP=0

BASE=/hive/data/genomes/oryCun2/bed/lastzCavPor3.2010-01-21
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=pk -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > do.log 2>&1 &
    #	real    1132m45.207s
    cat fb.oryCun2.chainCavPor3Link.txt 
    #	964546600 bases of 2604023284 (37.041%) in intersection

    mkdir /hive/data/genomes/cavPor3/bed/blastz.oryCun2.swap
    cd /hive/data/genomes/cavPor3/bed/blastz.oryCun2.swap
    time doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/oryCun2/bed/lastzCavPor3.2010-01-21/DEF \
	-swap -noLoadChainSplit -workhorse=hgwdev -bigClusterHub=swarm \
	-chainMinScore=3000 -chainLinearGap=medium > swap.log 2>&1 &
    #	real    186m1.649s
    cat fb.cavPor3.chainOryCun2Link.txt 
    #	1003499831 bases of 2663369733 (37.678%) in intersection

############################################################################
# monDom5 Opossum alignment (DONE - 2010-01-21 - Hiram)
    mkdir /hive/data/genomes/oryCun2/bed/lastzMonDom5.2010-01-21
    cd /hive/data/genomes/oryCun2/bed/lastzMonDom5.2010-01-21

    cat << '_EOF_' > DEF
# Rabbit vs. Opossum

# TARGET: Rabbit oryCun2
SEQ1_DIR=/scratch/data/oryCun2/oryCun2.2bit
SEQ1_LEN=/scratch/data/oryCun2/chrom.sizes
SEQ1_CHUNK=10000000
SEQ1_LAP=10000
SEQ1_LIMIT=50

# QUERY: Opossum
SEQ2_DIR=/scratch/data/monDom5/monDom5.2bit
SEQ2_LEN=/scratch/data/monDom5/chrom.sizes
SEQ2_CHUNK=10000000
SEQ2_LAP=0

BASE=/hive/data/genomes/oryCun2/bed/lastzMonDom5.2010-01-21
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	`pwd`/DEF \
	-noLoadChainSplit -syntenicNet \
	-workhorse=hgwdev -smallClusterHub=memk -bigClusterHub=swarm \
	-chainMinScore=5000 -chainLinearGap=loose > do.log 2>&1 &
    #	real    1791m17.275s
    cat fb.oryCun2.chainMonDom5Link.txt 
    #	176460344 bases of 2604023284 (6.776%) in intersection

    mkdir /hive/data/genomes/monDom5/bed/blastz.oryCun2.swap
    cd /hive/data/genomes/monDom5/bed/blastz.oryCun2.swap
    time doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/oryCun2/bed/lastzMonDom5.2010-01-21/DEF \
	-swap -noLoadChainSplit -workhorse=hgwdev -smallClusterHub=memk \
	-bigClusterHub=swarm \
	-chainMinScore=5000 -chainLinearGap=loose > swap.log 2>&1 &
    #	real    74m20.206s
    cat fb.monDom5.chainOryCun2Link.txt 
    #	178331930 bases of 3501660299 (5.093%) in intersection

############################################################################
# all.joiner update - (DONE - 2010-04-02 - Hiram)
    cd $HOME/kent/src/hg/makeDb/schema
    # fixup all.joiner until this is a clean output
    joinerCheck -database=oryCun2 -all all.joiner

    mkdir /hive/data/genomes/oryCun2/goldenPath
    cd /hive/data/genomes/oryCun2/goldenPath
    makeDownloads.pl oryCun2 > do.log 2>&1
    #	edit the README files to indicate reality and take a look
    #	to verify expected files are present

    #	now ready for pushQ entry
    mkdir /hive/data/genomes/oryCun2/pushQ
    cd /hive/data/genomes/oryCun2/pushQ
    makePushQSql.pl oryCun2 > oryCun2.pushQ.sql 2> stderr.out
    #	copy it to hgwbeta
    scp -p oryCun2.pushQ.sql hgwbeta:/tmp
    ssh hgwbeta
    cd /tmp
    hgsql qapushq < oryCun2.pushQ.sql
    #	in that pushQ entry walk through each entry and see if the
    #	sizes will set properly

############################################################################
# lift file to Ensembl scaffold coordinates (DONE - 2010-04-02 - Hiram)
    cd /hive/data/genomes/oryCun2/broad
    cat << '_EOF_' > ctgPos2ToEnsembl.pl
#!/usr/bin/env perl

my %chromSize;
open (FH, "<../chrom.sizes") or die "can not read ../chrom.sizes";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $size) = split('\s+',$line);
    $chromSize{$chr} = $size;
}
close (FH);

open (FH, "<ctgPos2.tab") or die "can not read ctgPos2.tab";
while (my $line = <FH>) {
    chomp $line;
    my ($scaf, $size, $chr, $start, $end, $type) = split('\s+', $line);
    printf "%s\t%s\t%s\t%s\t%s\n", $start, $scaf, $size, $chr, $chromSize{$chr};
}
close (FH);
'_EOF_'
    # << happy emacs
    chmod +x ctgPos2ToEnsembl.pl
    ./ctgPos2ToEnsembl.pl | sort -k4,4 -k3,3n > ../jkStuff/ensGene.lft

############################################################################
# lastz Mouse Mm9 swap (DONE - 2009-08-26 - Hiram)
    #	original alignment
    cd /hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15
    cat fb.mm9.chainOryCun2Link.txt
# 670229789 bases of 2620346127 (25.578%) in intersection

    #	and for the swap
    mkdir /hive/data/genomes/oryCun2/bed/blastz.mm9.swap
    cd /hive/data/genomes/oryCun2/bed/blastz.mm9.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/hive/data/genomes/mm9/bed/lastzOryCun2.2010-01-15/DEF \
	-noLoadChainSplit -chainMinScore=3000 -chainLinearGap=medium \
	-swap -workhorse=hgwdev -smallClusterHub=encodek -bigClusterHub=swarm \
	> swap.log 2>&1 &
    #	real    84m6.571s
    cat fb.oryCun2.chainMm9Link.txt 
    #	669602734 bases of 2604023284 (25.714%) in intersection

############################################################################
# HUMAN (hg18) PROTEINS TRACK (DONE braney 2010-04-28)
    # bash  if not using bash shell already

    cd /cluster/data/oryCun2
    mkdir /cluster/data/oryCun2/blastDb

    awk '{if ($2 > 1000000) print $1}' chrom.sizes > 1meg.lst
    twoBitToFa -seqList=1meg.lst  oryCun2.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    rm temp.fa 1meg.lst

    awk '{if ($2 <= 1000000) print $1}' chrom.sizes > less1meg.lst
    twoBitToFa -seqList=less1meg.lst  oryCun2.2bit temp.fa
    faSplit about temp.fa 1000000 blastDb/y 
    rm temp.fa less1meg.lst

    cd blastDb
    for i in *.fa
    do
	/hive/data/outside/blast229/formatdb -i $i -p F
    done
    rm *.fa
    ls *.nsq | wc -l
# 3326

    mkdir -p /cluster/data/oryCun2/bed/tblastn.hg18KG
    cd /cluster/data/oryCun2/bed/tblastn.hg18KG
    echo  ../../blastDb/*.nsq | xargs ls -S | sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 3326 query.lst


   # we want around 350000 jobs
   calc `wc /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl | awk '{print $1}'`/\(350000/`wc query.lst | awk '{print $1}'`\)

# 36727/(350000/3326) = 349.011434


   mkdir -p kgfa
   split -l 349 /cluster/data/hg18/bed/blat.hg18KG/hg18KG.psl  kgfa/kg
   cd kgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
   done
   cd ..
   ls -1S kgfa/*.fa > kg.lst
   wc kg.lst
#  106  106 1378 kg.lst

   mkdir -p blastOut
   for i in `cat kg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   tcsh
   cd /cluster/data/oryCun2/bed/tblastn.hg18KG
   cat << '_EOF_' > blastGsub
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/hive/data/outside/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /hive/data/outside/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/oryCun2/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/hg18/bed/blat.hg18KG/protein.lft warn $f.3
        if pslCheck -prot $3.tmp
        then                  
            mv $3.tmp $3     
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0               
    fi                      
fi                         
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome
    exit 
    
    ssh swarm
    cd /cluster/data/oryCun2/bed/tblastn.hg18KG
    gensub2 query.lst kg.lst blastGsub blastSpec
    para create blastSpec
#    para try, check, push, check etc.

    para time
# Completed: 352556 of 352556 jobs
# CPU time in finished jobs:   15911919s  265198.66m  4419.98h  184.17d  0.505 y
# IO & Wait Time:               2044688s   34078.13m   567.97h   23.67d  0.065 y
# Average job time:                  51s       0.85m     0.01h    0.00d
# Longest finished job:             257s       4.28m     0.07h    0.00d
# Submission to last job:         22838s     380.63m     6.34h    0.26d

    ssh swarm
    cd /cluster/data/oryCun2/bed/tblastn.hg18KG
    mkdir chainRun
    cd chainRun
    tcsh
    cat << '_EOF_' > chainGsub
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=150000 stdin ../c.`basename $1`.psl)
'_EOF_'
    chmod +x chainOne
    ls -1dS ../blastOut/kg?? > chain.lst
    gensub2 chain.lst single chainGsub chainSpec
    # do the cluster run for chaining
    para create chainSpec
    para try, check, push, check etc.

# Completed: 106 of 106 jobs
# CPU time in finished jobs:     659954s   10999.23m   183.32h    7.64d  0.021 y
# IO & Wait Time:                 79322s    1322.03m    22.03h    0.92d  0.003 y
# Average job time:                6974s     116.24m     1.94h    0.08d
# Longest finished job:           34335s     572.25m     9.54h    0.40d
# Submission to last job:         34345s     572.42m     9.54h    0.40d

    cd /cluster/data/oryCun2/bed/tblastn.hg18KG/blastOut
    for i in kg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort u.*.psl m60* | uniq | sort -T /tmp -k 14,14 -k 16,16n -k 17,17n > ../blastHg18KG.psl
    cd ..
    pslCheck blastHg18KG.psl
# checked: 97569 failed: 0 errors: 0

    # load table 
    ssh hgwdev
    cd /cluster/data/oryCun2/bed/tblastn.hg18KG
    hgLoadPsl oryCun2 blastHg18KG.psl

    # check coverage
    featureBits oryCun2 blastHg18KG 
# 42826260 bases of 2604023284 (1.645%) in intersection

    featureBits oryCun2 blastHg18KG refGene  -enrichment
# blastHg18KG 1.645%, refGene 0.076%, both 0.048%, cover 2.95%, enrich 38.75x

    featureBits oryCun2 blastHg18KG xenoRefGene  -enrichment
# blastHg18KG 1.645%, xenoRefGene 1.837%, both 0.978%, cover 59.48%, enrich 32.37x 

    rm -rf blastOut
#end tblastn

############################################################################
# construct ucscToEnsembl chrom name translation (2011-04-21 - Hiram)
    # the Ensembl genes v62 have new chrom names, construct translation:
    mkdir /hive/data/genomes/oryCun2/bed/ucscToEnsembl
    cd /hive/data/genomes/oryCun2/bed/ucscToEnsembl

grep "^chr" /hive/data/genomes/oryCun2/genbank/Primary_Assembly/localID2acc \
    | sort > genbank.names
cut -f1 genbank.names | sort > findChrNames

cut -f1 ../../chrom.sizes | sort > ucsc.names

join ucsc.names genbank.names | sed -e 's/\.1$//' > ens.names

comm -23 ucsc.names findChrNames | grep -v chrM | while read C
do
    ensName=${C/chr/}
    echo -e "${C}\t${ensName}"
done >> ens.names

sort ens.names > ucscToEnsembl.tab

# find length of ucsc names for key length in SQL below:
awk '{print length($1)}' ucscToEnsembl.tab | sort -u
4
5
9

    cat << '_EOF_' > ucscToEnsembl.sql
# UCSC to Ensembl chr name translation
CREATE TABLE ucscToEnsembl (
    ucsc varchar(255) not null,        # UCSC chromosome name
    ensembl varchar(255) not null,     # Ensembl chromosome name
              #Indices
    PRIMARY KEY(ucsc(9))
);
'_EOF_'

    hgsql oryCun2 < ucscToEnsembl.sql
    hgsql oryCun2 \
-e 'LOAD DATA LOCAL INFILE "ucscToEnsembl.tab" INTO TABLE ucscToEnsembl'

############################################################################
#  oryCun2 - Rabbit - Ensembl Genes version 62  (DONE - 2011-04-21 - hiram)
    ssh hgwdev
    cd /hive/data/genomes/oryCun2/jkStuff
    # construct new lift file with new Ensembl contig names

    cat << '_EOF_' > ens62Lift.pl
#!/bin/env perl

use strict;
use warnings;

my %chrSize;

open (FH, "<../chrom.sizes") or die "can not read ../chrom.sizes";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $size) = split('\s+', $line);
    $chrSize{$chr} = $size;
}
close (FH);

my %ensName;
open (FH, "<../bed/ucscToEnsembl/ucscToEnsembl.tab") or
        die "can not read ../bed/ucscToEnsembl/ucscToEnsembl.tab";
while (my $line = <FH>) {
    chomp $line;
    my ($chr, $name) = split('\s+', $line);
    $ensName{$chr} = $name;
}
close (FH);

foreach my $chr (keys %ensName) {
    printf "0\t%s\t%d\t%s\t%d\n", $ensName{$chr}, $chrSize{$chr}, $chr, $chrSize{$chr};
}
'_EOF_'
    # << happy emacs
    chmod +x ens62Lift.pl
    ./ens62Lift.pl > ens.62.lft

    # now, using that in the ensGene definition

    cd /hive/data/genomes/oryCun2
    cat << '_EOF_' > oryCun2.ensGene.ra
# required db variable
db oryCun2
# the lift file will change the chrom names, no nameTranslation needed
# nameTranslation "s/^/chr/;"
# ensembl v62 has new naming scheme based on NCBI release:
liftUp /hive/data/genomes/oryCun2/jkStuff/ens.62.lft
'_EOF_'
#  << happy emacs

    doEnsGeneUpdate.pl  -ensVersion=62 oryCun2.ensGene.ra
    ssh hgwdev
    cd /hive/data/genomes/oryCun2/bed/ensGene.62
    featureBits oryCun2 ensGene
    # 31797207 bases of 2604023284 (1.221%) in intersection

#############################################################################
# N-SCAN gene predictions (nscanGene) - (2011-05-02 markd)

/cluster/home/jeltje/hive/rabbit/oryCun2.prot.fa
/cluster/home/jeltje/hive/rabbit/chr_gtf/chr*.fa

    # obtained NSCAN predictions from Jeltje van Baren <jeltje@mbari.org>,
    cd /hive/data/genomes/oryCun2/bed/nscan
    cp /cluster/home/jeltje/hive/rabbit/oryCun2.prot.fa  .
    cp -r /cluster/home/jeltje/hive/rabbit/chr_gtf .
    gzip oryCun2.* chr_gtf/*
    chmod a-w oryCun2.* chr_gtf/*

    # load track
    zcat chr_gtf/*.gtf.gz | gtfToGenePred -genePredExt stdin stdout | hgLoadGenePred -genePredExt oryCun2 nscanGene stdin
    hgPepPred oryCun2 generic nscanPep oryCun2.prot.fa.gz
    rm *.tab

    # rabbit/oryCun2/trackDb.ra, add:
    track nscanGene override
    informant Rabbit N-SCAN used human (hg19) as the informant for conservation.

    # verify top-level search spec, should produce no results an an error for egrep:
    hgsql -Ne 'select name from nscanGene' oryCun2 | egrep -v '^chr[0-9a-zA-Z_]+\.([0-9]+|pasa)((\.[0-9a-z]+)?\.[0-9a-z]+)?$' |head

############################################################################
