# for emacs: -*- mode: sh; -*-

#	Caenorhabditis PB2801
#	This is the actual name, it is sometimes known as:
#	Caenorhabditis n. sp. PB2801
#	From Wash U GSC
#	http://genome.wustl.edu/genome.cgi?GENOME=Caenorhabditis+n.+sp.+PB2801


#########################################################################
# DOWNLOAD SEQUENCE (DONE - 2007-02-16 - Hiram)
    ssh kkstore01
    mkdir /cluster/store10/caePb1
    ln -s /cluster/store10/caePb1 /cluster/data/caePb1

    cd /cluster/data/caePb1
    mkdir downloads
    cd downloads
    wget --timestamping \
"ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_PB2801/assembly/draft/Caenorhabditis_PB2801-4.0/ASSEMBLY"

    wget --timestamping \
"ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_PB2801/assembly/draft/Caenorhabditis_PB2801-4.0/README" -O README.caePb1

    wget --timestamping -m -nd -np \
"ftp://genome.wustl.edu/pub/organism/Invertebrates/Caenorhabditis_PB2801/assembly/draft/Caenorhabditis_PB2801-4.0/output/"

    #	Going to manipulate the given supercontigs.agp.gz file to produce a
    #	chrUn.agp.gz file, although this ends up being too detailed.  A second
    #	super.lift and gold.tab will also be produced to get the chrUn_gold
    #	table loaded with appropriate supercontigs without all the bridged
    #	gaps.  This first chrUn.agp.gz file will be used with the initial
    #	build so that the gap table will be constructed properly with all the
    #	bridged gaps.
    faCount contigs.fa.gz > faCount.contigs.fa.txt
    mkdir ../jkStuff
    cat << '_EOF_' > ../jkStuff/mkChrUnAgp.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $start = 1;
my $end = 1;
my $itemCount = 1;
my $curContig = "";
my $firstTime = 1;
my $scaffoldGapSize = 1000;

open (FH,'zcat supercontigs.agp.gz|') or
        die "can not open zcat supercontigs.agp.gz";
while (my $line=<FH>) {
    chomp $line;
    if ($line =~ m/fragment/) {
        my ($name, $gStart, $gEnd, $counter, $WN, $gapLen, $frag, $yesNo) =
            split('\s+',$line);
        $end = $start + $gapLen - 1;
        printf "chrUn\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n",
                    $start, $end, $itemCount++, $WN, $gapLen, $frag, $yesNo;
        $start = $end + 1;
    } else {
    my ($ctgName, $ctgStart, $ctgEnd, $counter, $WN, $name, $cStart, $cEnd, $strand) =
        split('\s+',$line);
        my $ctgLen = $ctgEnd - $ctgStart + 1;
        my $cLen = $cEnd - $cStart + 1;
        die "not matching start, end:\n$line" if ($ctgLen != $cLen);
        if (!$firstTime) {
            if ($ctgName ne $curContig) {
                $end = $start + $scaffoldGapSize - 1;
                printf "chrUn\t%d\t%d\t%d\tN\t%d\tscaffold\tno\n",
                    $start, $end, $itemCount++, $scaffoldGapSize;
                $start = $end + 1;
            }
        } else {
            $firstTime = 0;
        }
        $end = $start + $ctgLen - 1;
        printf "chrUn\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n",
                $start, $end, $itemCount++, $WN, $name,
                $cStart, $cEnd, $strand;
        $start = $end + 1;
        $curContig = $ctgName;
    }
}
close (FH);
'_EOF_'
# << happy emacs
    chmod +x ../jkStuff/mkChrUnAgp.pl
    ../jkStuff/mkChrUnAgp.pl | gzip > chrUn.agp.gz
    qaToQac contigs.fa.qual.gz stdout \
        | qacAgpLift chrUn.agp.gz stdin chrUn.qac

##########################################################################
## Initial database construction and beginning tables
##	(DONE - 2007-03-01 - Hiram)

    ssh kkstore01
    cd /cluster/data/caePb1
    cat << '_EOF_' > caePb1.config.ra
# Config parameters for makeGenomeDb.pl:
db caePb1
clade worm
genomeCladePriority 66
scientificName Caenorhabditis PB2801
commonName C. PB2801
assemblyDate Jan. 2007
assemblyLabel Washington University School of Medicine GSC Caenorhabditis n. sp. PB2801
orderKey 876
mitoAcc none
fastaFiles /cluster/data/caePb1/downloads/supercontigs.fa.gz
agpFiles /cluster/data/caePb1/downloads/scaffold.agp.gz
qualFiles /cluster/data/caePb1/downloads/chrUn.qac
dbDbSpeciesDir worm
'_EOF_'
# << happy emacs

    time makeGenomeDb.pl -verbose=2 caePb1.config.ra > makeGenomeDb.out 2>&1

##########################################################################
##  Fixup chrUn_gold table, and add ctgPos2 table

    ssh kkstore01
    cd /cluster/data/caePb1/downloads

    #	create a ctgPos2.tab table
    cat << '_EOF_' > ../jkStuff/mkScafAgp.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $start = 1;
my $end = 1;
my $itemCount = 1;
my $curContig = "";
my $firstTime = 1;
my $scaffoldGapSize = 1000;
my $scafEnd = 1;

open (CT,">ctgPos2.tab") or die "can not write to ctgPos2.tab";

open (FH,'zcat supercontigs.agp.gz|') or
        die "can not open zcat supercontigs.agp.gz";
while (my $line=<FH>) {
    chomp $line;
    if ($line =~ m/fragment/) {
        my ($name, $gStart, $gEnd, $counter, $WN, $gapLen, $frag, $yesNo) =
            split('\s+',$line);
        $end = $start + $gapLen - 1;
        $scafEnd += $gapLen;
        printf "chrUn\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n",
                    $start, $end, $itemCount++, $WN, $gapLen, $frag, $yesNo;
        $start = $end + 1;
    } else {
    my ($ctgName, $ctgStart, $ctgEnd, $counter, $WN, $name, $cStart, $cEnd, $strand) =
        split('\s+',$line);
        my $ctgLen = $ctgEnd - $ctgStart + 1;
        my $cLen = $cEnd - $cStart + 1;
        die "not matching start, end:\n$line" if ($ctgLen != $cLen);
        if (!$firstTime) {
            if ($ctgName ne $curContig) {
                $end = $start + $scaffoldGapSize - 1;
                printf "chrUn\t%d\t%d\t%d\tN\t%d\tscaffold\tno\n",
                    $start, $end, $itemCount++, $scaffoldGapSize;
                $start = $end + 1;
                $scafEnd = $cStart - 1;
            }
        } else {
            $firstTime = 0;
            $scafEnd = 0;
        }
        $scafEnd += $ctgLen;
        my $fragStart = $scafEnd - $ctgLen + 1;
        $end = $start + $ctgLen - 1;
        printf "chrUn\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n",
                $start, $end, $itemCount++, $WN, $ctgName,
                $fragStart, $scafEnd, $strand;
        printf CT "%s\t%d\tchrUn\t%d\t%d\tW\n",
                $name, $ctgLen, $start-1, $end;
        $start = $end + 1;
        $curContig = $ctgName;
    }
}
close (FH);
close (CT);
'_EOF_"
# << happy emacs
    chmod +x ../jkStuff/mkScafAgp.pl
    nice -n +19 ../jkStuff/mkScafAgp.pl

    ##	and load that ctgPos2 table:
    ssh hgwdev
    cd /cluster/data/caePb1/downloads
    hgLoadSqlTab caePb1 ctgPos2 ~/kent/src/hg/lib/ctgPos2.sql ctgPos2.tab

    ##  and the super.lift to get supercontigs lifted to chrUn
    ssh kkstore01
    cd /cluster/data/caePb1/downloads
    
    cat << '_EOF_' > ../jkStuff/mkSuperLift.pl
#!/usr/bin/env perl

use strict;
use warnings;

my $start = 1;
my $end = 1;
my $itemCount = 1;
my $curContig = "";
my $firstTime = 1;
my $scaffoldGapSize = 1000;
my $scafEnd = 1;
my $superStart = 0;
my $superEnd = 1;
my $superLength = 0;
my $chrUnSize = 207333000;
my $chrUnName = "chrUn";

open (GL,">gold.tab") or die "Can not write to gold.tab";

open (FH,'zcat supercontigs.agp.gz|') or
        die "can not open zcat supercontigs.agp.gz";
while (my $line=<FH>) {
    chomp $line;
    if ($line =~ m/fragment/) {
        my ($name, $gStart, $gEnd, $counter, $WN, $gapLen, $frag, $yesNo) =
            split('\s+',$line);
        $end = $start + $gapLen - 1;
        $scafEnd += $gapLen;
#       printf "chrUn\t%d\t%d\t%d\t%s\t%d\t%s\t%s\n",
#                   $start, $end, $itemCount++, $WN, $gapLen, $frag, $yesNo;
        $start = $end + 1;
    } else {
    my ($ctgName, $ctgStart, $ctgEnd, $counter, $WN, $name, $cStart, $cEnd, $strand) =
        split('\s+',$line);
        my $ctgLen = $ctgEnd - $ctgStart + 1;
        my $cLen = $cEnd - $cStart + 1;
        die "not matching start, end:\n$line" if ($ctgLen != $cLen);
        if (!$firstTime) {
            if ($ctgName ne $curContig) {
                $superLength = $superEnd - $superStart;
                $end = $start + $scaffoldGapSize - 1;
#               printf "chrUn\t%d\t%d\t%d\tN\t%d\tscaffold\tno\n",
#                   $start, $end, $itemCount++, $scaffoldGapSize;
                printf "%d\t%s\t%d\t%s\t%d\n",
                        $superStart, $curContig, $superLength, $chrUnName,
                                $chrUnSize;
                printf GL "%s\t%d\t%d\t%d\tW\t%s\t0\t%d\t+\n",
                        $chrUnName, $superStart, $superEnd, $itemCount++,
                        $curContig, $superLength;
                $start = $end + 1;
                $scafEnd = $cStart - 1;
                $superStart = $start - 1;
            }
        } else {
            $firstTime = 0;
            $scafEnd = 0;
        }
        $scafEnd += $ctgLen;
        my $fragStart = $scafEnd - $ctgLen + 1;
        $end = $start + $ctgLen - 1;
        $superEnd = $end;
#       printf "chrUn\t%d\t%d\t%d\t%s\t%s\t%d\t%d\t%s\n",
#               $start, $end, $itemCount++, $WN, $ctgName,
#               $fragStart, $scafEnd, $strand;
#       printf SL "%s\t%d\tchrUn\t%d\t%d\tW\n",
#               $name, $ctgLen, $start-1, $end;
        $start = $end + 1;
        $curContig = $ctgName;
    }
}
close (FH);
$superLength = $superEnd - $superStart;
printf "%d\t%s\t%d\t%s\t%d\n",
    $superStart, $curContig, $superLength, $chrUnName, $chrUnSize;
printf GL "%s\t%d\t%d\t%d\tW\t%s\t0\t%d\t+\n",
        $chrUnName, $superStart, $superEnd, $itemCount++,
        $curContig, $superLength;
close (GL);
'_EOF_'
# << happy emacs
    chmod +x ../jkStuff/mkSuperLift.pl

    nice -n +19 ../jkStuff/mkSuperLift.pl > super.lift

    ##	and load that ctgPos2 table:

    ssh hgwdev
    cd /cluster/data/caePb1/downloads
    sed -e "s/agpFrag/chrUn_gold/" $HOME/kent/src/hg/lib/agpFrag.sql \
	> chrUn_gold.sql
    # edit that .sql file to add the bin column
    #	 bin smallint(6) NOT NULL default '0',

    hgLoadBed -sqlTable=chrUn_gold.sql caePb1 chrUn_gold gold.tab

##############################################################################
## Repeat Masker does not recognize given species:
#	Species "Caenorhabditis PB2801" is not known to RepeatMasker.

##############################################################################
## WindowMasker masking (DONE - 2007-03-02 - Hiram)
    ssh kolossus
    mkdir /cluster/data/caePb1/bed/WindowMasker
    cd /cluster/data/caePb1/bed/WindowMasker
    time nice -n +19 ~/kent/src/hg/utils/automation/doWindowMasker.pl \
	caePb1 -buildDir=/cluster/data/caePb1/bed/WindowMasker \
	    -workhorse=kolossus > doWM.out 2>&1 &
    ## load result
    ssh hgwdev
    cd /cluster/data/caePb1/bed/WindowMasker
    hgLoadBed -strict caePb1 windowmaskerSdust windowmasker.sdust.bed.gz

##############################################################################
## simpleRepeat masking (DONE - 2007-03-02 - Hiram)
    ssh kolossus
    mkdir /cluster/data/caePb1/bed/simpleRepeat
    cd /cluster/data/caePb1/bed/simpleRepeat
    nice -n +19 twoBitToFa ../../caePb1.unmasked.2bit stdout \
	| trfBig -trf=/cluster/bin/i386/trf stdin /dev/null \
		-bedAt=simpleRepeat.bed -tempDir=/scratch/tmp
    awk '{if ($5 <= 12) print;}' simpleRepeat.bed > trfMask.bed

    ssh hgwdev
    cd /cluster/data/caePb1/bed/simpleRepeat
    nice -n +19 hgLoadBed caePb1 simpleRepeat \
      simpleRepeat.bed -sqlTable=$HOME/kent/src/hg/lib/simpleRepeat.sql

    featureBits caePb1 simpleRepeat
    #	4938308 bases of 175247318 (2.818%) in intersection

##############################################################################
## combine trf mask with windowmasker (DONE - 2007-03-03 - Hiram)
    ssh kkstore02
    cd /cluster/data/caePb1
    cat << '_EOF_' > addTrf.csh
#!/bin/csh -efx
# This script will fail if any of its commands fail.

set DB = caePb1
set WORK_DIR = /cluster/data/${DB}
set WM_DIR = /cluster/data/${DB}/bed/WindowMasker
cd ${WORK_DIR}
set inputTwoBit = ${WM_DIR}/${DB}.wmsk.sdust.2bit
set tmpTwoBit = ${WM_DIR}/${DB}.sdTrf.n.2bit
set outputTwoBit = ${WORK_DIR}/${DB}.sdTrf.2bit
cat /cluster/data/${DB}/bed/simpleRepeat/trfMask.bed \
        | twoBitMask -add -type=.bed ${inputTwoBit} stdin ${tmpTwoBit}
twoBitToFa ${tmpTwoBit} stdout | sed -e "s/n/N/g" \
        | sed -e "s/chrUN/chrUn/" | faToTwoBit stdin ${outputTwoBit}
twoBitToFa ${outputTwoBit} stdout | faSize -showPercent stdin \
        > faSize.${DB}.sdTrf.txt
rm -f ${tmpTwoBit}
cat faSize.${DB}.sdTrf.txt
'_EOF_'
# << happy emacs
    chmod +x ./addTrf.csh
    nice -n +19 ./addTrf.csh
    #	produces this masked file:
    # 	-rw-rw-r--  1 63185508 Mar  3 09:18 caePb1.sdTrf.2bit
    #	faSize -showPercent on it indicates:
    #	207333000 bases (32085682 N's 175247318 real
    #	124302731 upper 50944587 lower) in 1 sequences in 1 files
    #	%24.57 masked total, %29.07 masked real

##############################################################################
## create masked contigs for blastz (DONE - 2007-03-04 - Hiram)
    ssh kkstore02
    mkdir /cluster/data/caePb1/maskedContigs
    cd /cluster/data/caePb1/maskedContigs
    ln -s ../downloads/super.lift .
    #	copy over lft2BitToFa.pl from /cluster/data/gasAcu1/jkStuff
    ../jkStuff/lft2BitToFa.pl ../caePb1.2bit super.lift > super.fa
    faToTwoBit super.fa caePb1.supercontigs.sdTrf.2bit
    twoBitInfo caePb1.supercontigs.sdTrf.2bit stdout | sort -k2nr \
        > caePb1.supercontigs.sizes
    #	Copy this 2bit file and sizes over to /san/sanvol1/scratch/worms/caePb1

    #  the lft2BitToFa.pl script is:
    cat << '_EOF_'
#!/usr/bin/env perl

use strict;
use warnings;
use File::Basename;
    
sub usage()
{
printf "usage: %s <file.2bit> <file.lft> [more_files.lft]\n",  basename($0);
printf "\tfasta output is to stdout, therefore route stdout to result file\n";
exit 255;
}   
    
my $argc = scalar(@ARGV);

usage if ($argc < 2);

my $twoBitFile = shift;
    
while (my $liftFile = shift)
{   
open (FH,"<$liftFile") or die "Can not open $liftFile";
while (my $line=<FH>)
    {
    chomp $line;
    my ($start, $contig, $length, $chrom, $chrom_length) = split('\s',$line);
    my $cmd=sprintf("twoBitToFa $twoBitFile:%s:%d-%d stdout",
        $chrom, $start, $start+$length);
    print `$cmd | sed -e "s#^>.*#>$contig#"`;
    }
close (FH);
}
'_EOF_'
# << happy emacs

############################################################################
## Default position (DONE - 2007-04-09 - Hiram)
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="chrUn:123456-234567"
	where name="caePb1";' hgcentraltest

############################################################################
## SWAP BLASTZ cb3 alignments (DONE - 2007-04-18 - Hiram)
    ssh kkstore01
    cd /cluster/data/cb3/bed/blastz.caePb1.2007-04-18
    cat fb.cb3.chainCaePb1Link.txt
    #	42772225 bases of 108433446 (39.446%) in intersection

    mkdir /cluster/data/caePb1/bed/blastz.cb3.swap
    cd /cluster/data/caePb1/bed/blastz.cb3.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/cb3/bed/blastz.caePb1.2007-04-18/DEF \
	-qRepeats=windowmaskerSdust -bigClusterHub=kk \
	-swap > swap.log 2>&1 &
    #	real    3m57.854s
    cat fb.caePb1.chainCb3Link.txt
    #	59366018 bases of 175247318 (33.876%) in intersection

##############################################################################
## Repeat masker with -species "caenorhabditis sp. 4 (ce-ii)"
##	(DONE - 2007-04-19 - Hiram)

    ssh kkstore01
    mkdir /cluster/data/caePb1/bed/RepeatMasker
    cd /cluster/data/caePb1/bed/RepeatMasker
    time nice -n +19 doRepeatMasker.pl -species "caenorhabditis sp. 4 (ce-ii)"

    time nice -n +19 doRepeatMasker.pl -species "caenorhabditis" \
	-bigClusterHub=kk \
	-buildDir=/cluster/data/caePb1/bed/RepeatMasker caePb1 > do.log 2>&1 &
    #	real    118m40.879s

    cd /cluster/data/caePb1
    twoBitToFa caePb1.rmsk.2bit stdout | faSize stdin
    #	207333000 bases (32085682 N's 175247318 real
    #	169630672 upper 5616646 lower) in 1 sequences in 1 files
    #	%2.71 masked total, %3.20 masked real

    ssh hgwdev
    featureBits caePb1 rmsk
    #	5618476 bases of 175247318 (3.206%) in intersection

##############################################################################
## BLASTZ swap ce4 (DONE - 2007-03-04 -- Hiram)
    ssh kkstore01
    cd  /cluster/data/ce4/bed/blastz.caePb1.um.2007-03-04
    cat fb.ce4.chainCaePb1Link.txt
    #	52594760 bases of 100281244 (52.447%) in intersection

    mkdir /cluster/data/caePb1/bed/blastz.ce4.swap
    cd /cluster/data/caePb1/bed/blastz.ce4.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	-qRepeats=windowmaskerSdust \
	/cluster/data/ce4/bed/blastz.caePb1.um.2007-03-04/DEF \
	-bigClusterHub=pk -swap > swap.log 2>&1 &
    #	real    22m40.205s
    cat fb.caePb1.chainCe4Link.txt
    #	74758640 bases of 175247318 (42.659%) in intersection

##############################################################################
## BLASTZ caeRem2 (DONE - 2007-04-19,20 - Hiram)
    ssh kkstore01
    mkdir /cluster/data/caePb1/bed/blastz.caeRem2.2007-04-19
    cd /cluster/data/caePb1/bed/blastz.caeRem2.2007-04-19

    cat << '_EOF_' > DEF
# caePb1 vs caeRem2
BLASTZ_H=2000
BLASTZ_M=50

# TARGET: C. PB2801 caePb1 unmasked sequence
SEQ1_DIR=/san/sanvol1/scratch/worms/caePb1/caePb1.unmasked.2bit
SEQ1_LEN=/san/sanvol1/scratch/worms/caePb1/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
SEQ1_LIMIT=50

# QUERY: briggsae caeRem2, 9,660 contigs, longest 5,925,111
SEQ2_DIR=/san/sanvol1/scratch/worms/caeRem2/caeRem2.2bit
SEQ2_LEN=/san/sanvol1/scratch/worms/caeRem2/chrom.sizes
SEQ2_CTGDIR=/san/sanvol1/scratch/worms/caeRem2/caeRem2.contigs.2bit
SEQ2_CTGLEN=/san/sanvol1/scratch/worms/caeRem2/caeRem2.contigs.sizes
SEQ2_LIFT=/san/sanvol1/scratch/worms/caeRem2/caeRem2.chrUn.lift
SEQ2_CHUNK=1000000
SEQ2_LAP=10000
SEQ2_LIMIT=50

BASE=/cluster/data/caePb1/bed/blastz.caeRem2.2007-04-19
TMPDIR=/scratch/tmp
'_EOF_'
    # << happy emacs

    time nice -n +19 doBlastzChainNet.pl DEF -verbose=2 -bigClusterHub=kk \
	-blastzOutRoot /cluster/bluearc/caePb1CaeRem2 > do.log 2>&1 &
    #	real    569m7.643s
    cat fb.caePb1.chainCaeRem2Link.txt
    #	93139445 bases of 175247318 (53.147%) in intersection

    #	swap to caeRem2  (also in caeRem2.txt)
    mkdir /cluster/data/caeRem2/bed/blastz.caePb1.swap
    cd /cluster/data/caeRem2/bed/blastz.caePb1.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 \
	/cluster/data/caePb1/bed/blastz.caeRem2.2007-04-19/DEF \
	-bigClusterHub=kk -swap > swap.log 2>&1 &
    #	real    32m51.630s
    cat fb.caeRem2.chainCaePb1Link.txt
    #	78402339 bases of 146898439 (53.372%) in intersection

##############################################################################
## BLASTZ priPac1 (DONE - 2007-04-20,21 - Hiram)
    mkdir /cluster/data/caePb1/bed/blastz.priPac1.2007-04-20
    cd /cluster/data/caePb1/bed/blastz.priPac1.2007-04-20

    cat << '_EOF_' > DEF
# caePb1 vs priPac1
BLASTZ_H=2000
BLASTZ_M=50

# TARGET: C. PB2801 caePb1 unmasked sequence
SEQ1_DIR=/san/sanvol1/scratch/worms/caePb1/caePb1.unmasked.2bit
SEQ1_LEN=/san/sanvol1/scratch/worms/caePb1/chrom.sizes
SEQ1_CHUNK=1000000
SEQ1_LAP=10000
SEQ1_LIMIT=50

# QUERY: Pristionchus pacificus priPac1
SEQ2_DIR=/san/sanvol1/scratch/worms/priPac1/priPac1.unmasked.2bit
SEQ2_LEN=/san/sanvol1/scratch/worms/priPac1/chrom.sizes
SEQ2_CHUNK=1000000
SEQ2_LAP=10000
SEQ2_LIMIT=50

BASE=/cluster/data/caePb1/bed/blastz.priPac1.2007-04-20
TMPDIR=/scratch/tmp
'_EOF_'

    time nice -n +19 doBlastzChainNet.pl DEF -verbose=2 -bigClusterHub=pk \
	-blastzOutRoot /cluster/bluearc/caePb1PriPac1 > do.log 2>&1 &
    #	real    406m52.445s
    cat fb.caePb1.chainPriPac1Link.txt
    #	13589188 bases of 175247318 (7.754%) in intersection

    ## swap to priPac1 - also in priPac1.txt
    mkdir /cluster/data/priPac1/bed/blastz.caePb1.swap
    cd /cluster/data/priPac1/bed/blastz.caePb1.swap
    time nice -n +19 doBlastzChainNet.pl -verbose=2 -bigClusterHub=pk \
	/cluster/data/caePb1/bed/blastz.priPac1.2007-04-20/DEF \
	-swap > swap.log 2>&1 &
    #	real    9m18.554s
    cat fb.priPac1.chainCaePb1Link.txt
    #	11713567 bases of 145948246 (8.026%) in intersection

#########################################################################
# MAKE 11.OOC FILE FOR BLAT/GENBANK (DONE - 2007-04-23 - Hiram)
    # Use -repMatch=62 (based on size -- for human we use 1024, and 
    # C. PB2801 size is ~6.1% of human judging by gapless caePb1 vs. hg18 
    # genome sizes from featureBits.
    ssh kolossus
    cd /cluster/data/caePb1
    blat caePb1.2bit /dev/null /dev/null -tileSize=11 \
      -makeOoc=jkStuff/11.ooc -repMatch=62
    #	Wrote 37423 overused 11-mers to jkStuff/11.ooc
    cp -p jkStuff/11.ooc /san/sanvol1/scratch/worms/caePb1

#########################################################################
# GENBANK AUTO UPDATE (DONE - 2007-04-23,25 - Hiram)
    # align with latest genbank process.
    ssh hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # edit etc/genbank.conf to add caePb1 just before caeRem2

# caePb1 (C. PB2801)
caePb1.serverGenome = /cluster/data/caePb1/caePb1.2bit
caePb1.clusterGenome = /iscratch/i/worms/caePb1/caePb1.2bit
caePb1.ooc = /iscratch/i/worms/caePb1/11.ooc
caePb1.lift = /iscratch/i/worms/caePb1/caePb1.supercontigs.lift
caePb1.refseq.mrna.native.pslCDnaFilter  = ${lowCover.refseq.mrna.native.pslCDnaFilter}
caePb1.refseq.mrna.xeno.pslCDnaFilter    = ${lowCover.refseq.mrna.xeno.pslCDnaFilter}
caePb1.genbank.mrna.native.pslCDnaFilter = ${lowCover.genbank.mrna.native.pslCDnaFilter}
caePb1.genbank.mrna.xeno.pslCDnaFilter   = ${lowCover.genbank.mrna.xeno.pslCDnaFilter}
caePb1.genbank.est.native.pslCDnaFilter  = ${lowCover.genbank.est.native.pslCDnaFilter}
caePb1.refseq.mrna.native.load = no
caePb1.refseq.mrna.xeno.load  = yes
caePb1.refseq.mrna.xeno.loadDesc = yes
caePb1.genbank.mrna.xeno.load = yes
caePb1.genbank.est.native.load = yes
caePb1.genbank.est.native.loadDesc = no
caePb1.downloadDir = caePb1
caePb1.perChromTables = no

    cvs ci -m "Added caePb1." etc/genbank.conf
    # update /cluster/data/genbank/:
    make etc-update

    # Edit src/lib/gbGenome.c to add new species.
    cvs ci -m "Added C. PB2801." src/lib/gbGenome.c
    make install-server

    ssh kkstore01
    cd /cluster/data/genbank
    time nice -n +19 bin/gbAlignStep -initial caePb1 &
    #	logFile: var/build/logs/2007.07.10-17:01:24.caePb1.initalign.log
    #	real    353m16.891s

    # load database when finished
    ssh hgwdev
    cd /cluster/data/genbank
    time nice -n +19 ./bin/gbDbLoadStep -drop -initialLoad caePb1
    #	logFile: var/dbload/hgwdev/logs/2007.07.11-12:52:25.dbload.log
    #	real    7m49.325s

    # enable daily alignment and update of hgwdev
    cd ~/kent/src/hg/makeDb/genbank
    cvsup
    # add caePb1 to:
        etc/align.dbs
        etc/hgwdev.dbs
    cvs ci -m "Added caePb1 - C. PB2801" etc/align.dbs etc/hgwdev.dbs
    make etc-update

############################################################################
#  BLATSERVERS ENTRY (DONE - 2007-04-23 - Hiram)
#	After getting a blat server assigned by the Blat Server Gods,
    ssh hgwdev

    hgsql -e 'INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("caePb1", "blat10", "17786", "1", "0"); \
	INSERT INTO blatServers (db, host, port, isTrans, canPcr) \
	VALUES ("caePb1", "blat10", "17787", "0", "1");' \
	    hgcentraltest
    #	test it with some sequence

##########################################################################
## summarize chainLink measurements (2007-04-25 - Hiram)
# org       on caePb1  on other
# caeRem2   53.147     53.372
# ce4       42.659     52.447
# cb3       33.876     39.446
# priPac1    7.754      8.026

##########################################################################
#  Set default position to cover the 16S ribosomal RNA area
##	(DONE - 2007-04-26 - Hiram)
    ssh hgwdev
    hgsql -e 'update dbDb set defaultPos="chrUn:97956213-97960870"
	where name="caePb1"' hgcentraltest

##########################################################################
## This nematode now has an official name (DONE - 2007-04-27 - Hiram)
    ssh hgwdev
    hgsql -e 'update dbDb set organism="C. brenneri"
	where name="caePb1"' hgcentraltest
    hgsql -e 'update dbDb set genome="C. brenneri"
	where name="caePb1"' hgcentraltest
    hgsql -e 'update dbDb set scientificName="Caenorhabditis brenneri"
	where name="caePb1"' hgcentraltest
    hgsql -e 'update defaultDb set genome="C. brenneri"
	where name="caePb1"' hgcentraltest
    hgsql -e 'update genomeClade set genome="C. brenneri"
	where genome="C. PB2801"' hgcentraltest

##########################################################################
## Creating downloads (DONE - 2007-04-30 - Hiram)
    #	There is only one chrom, make its trfMaskChrom file exist
    ssh hgwdev
    mkdir /cluster/data/caePb1/bed/simpleRepeat/trfMaskChrom
    cd /cluster/data/caePb1/bed/simpleRepeat/trfMaskChrom
    ## symlink didn't work here, the symlink ended up in the tar file
    cp -p ../trfMask.bed ./chrUn.bed
    cd  /cluster/data/caePb1
    /cluster/bin/scripts/makeDownloads.pl caePb1 
    ## *!* EDIT THE README.txt FILES *!* ##

##########################################################################
## Creating pushQ (DONE - 2007-04-30 - Hiram)
    ssh hgwdev
    mkdir /cluster/data/caePb1/pushQ
    cd /cluster/data/caePb1/pushQ
    /cluster/bin/scripts/makePushQSql.pl caePb1 > caePb1.sql 2> stderr.out

###########################################################################
# ELEGANS (ce4) PROTEINS TRACK (DONE -  Hiram - 2007-05-02)
    ssh kkstore01

    #	breaking up this target genome into manageable pieces
    mkdir /cluster/data/caePb1/blastDb
    cd /cluster/data/caePb1
    twoBitToFa caePb1.unmasked.2bit temp.fa
    faSplit gap temp.fa 1000000 blastDb/x -lift=blastDb.lft
    #	222 pieces of 222 written
    rm temp.fa
    cd blastDb
    for i in *.fa
    do
	/cluster/bluearc/blast229/formatdb -i $i -p F
    done
    rm *.fa

    ##	copy to san for kluster access
    mkdir -p /san/sanvol1/scratch/worms/caePb1/blastDb
    cd /san/sanvol1/scratch/worms/caePb1/blastDb
    rsync -a --progress --stats /cluster/data/caePb1/blastDb/ .

    ## create the query protein set
    mkdir -p /cluster/data/caePb1/bed/tblastn.ce4SG
    cd /cluster/data/caePb1/bed/tblastn.ce4SG
    echo  /san/sanvol1/scratch/worms/caePb1/blastDb/*.nsq | xargs ls -S \
	| sed "s/\.nsq//"  > query.lst
    wc -l query.lst
# 222 query.lst

   # we want around 50000 jobs
   calc `wc /cluster/data/ce4/bed/blat.ce4SG/ce4SG.psl | awk "{print \\\$1}"`/\(50000/`wc query.lst | awk "{print \\\$1}"`\)
23192/(50000/222) = 102.972480
    #	round up this number a bit, and use in the split here

   mkdir -p /cluster/bluearc/worms/caePb1/bed/tblastn.ce4SG/sgfa
   split -l 103 /cluster/data/ce4/bed/blat.ce4SG/ce4SG.psl \
	/cluster/bluearc/worms/caePb1/bed/tblastn.ce4SG/sgfa/sg
   ln -s /cluster/bluearc/worms/caePb1/bed/tblastn.ce4SG/sgfa sgfa
   cd sgfa
   for i in *; do 
     nice pslxToFa $i $i.fa; 
     rm $i; 
   done
   cd ..
   ls -1S sgfa/*.fa > sg.lst
   mkdir -p /cluster/bluearc/worms/caePb1/bed/tblastn.ce4SG/blastOut
   ln -s /cluster/bluearc/worms/caePb1/bed/tblastn.ce4SG/blastOut ./blastOut
   for i in `cat sg.lst`; do  mkdir blastOut/`basename $i .fa`; done
   
   cd /cluster/data/caePb1/bed/tblastn.ce4SG
   cat << '_EOF_' > template
#LOOP
blastSome $(path1) {check in line $(path2)} {check out exists blastOut/$(root2)/q.$(root1).psl }
#ENDLOOP
'_EOF_'
    # << happy emacs

   cat << '_EOF_' > blastSome
#!/bin/sh
BLASTMAT=/cluster/bluearc/blast229/data
export BLASTMAT
g=`basename $2`
f=/tmp/`basename $3`.$g
for eVal in 0.01 0.001 0.0001 0.00001 0.000001 1E-09 1E-11
do
if /cluster/bluearc/blast229/blastall -M BLOSUM80 -m 0 -F no -e $eVal -p tblastn -d $1 -i $2 -o $f.8
then
        mv $f.8 $f.1
        break;
fi
done
if test -f  $f.1
then
    if /cluster/bin/i386/blastToPsl $f.1 $f.2
    then
	liftUp -nosort -type=".psl" -nohead $f.3 /cluster/data/caePb1/blastDb.lft carry $f.2
        liftUp -nosort -type=".psl" -pslQ -nohead $3.tmp /cluster/data/ce4/bed/blat.ce4SG/protein.lft warn $f.3

        if pslCheck -prot $3.tmp
        then
            mv $3.tmp $3
            rm -f $f.1 $f.2 $f.3 $f.4
        fi
        exit 0
    fi
fi
rm -f $f.1 $f.2 $3.tmp $f.8 $f.3 $f.4
exit 1
'_EOF_'
    # << happy emacs
    chmod +x blastSome

    ssh pk
    cd /cluster/data/caePb1/bed/tblastn.ce4SG
    gensub2 query.lst sg.lst template jobList
    para create jobList
#    para try, check, push, check etc.
# Completed: 50172 of 50172 jobs
# CPU time in finished jobs:     944883s   15748.06m   262.47h   10.94d  0.030 y
# IO & Wait Time:                193584s    3226.39m    53.77h    2.24d  0.006 y
# Average job time:                  23s       0.38m     0.01h    0.00d
# Longest finished job:             306s       5.10m     0.09h    0.00d
# Submission to last job:          3663s      61.05m     1.02h    0.04d

    # do the cluster run for chaining
    ssh pk
    mkdir /cluster/data/caePb1/bed/tblastn.ce4SG/chainRun
    cd /cluster/data/caePb1/bed/tblastn.ce4SG/chainRun
    cat << '_EOF_' > template
#LOOP
chainOne $(path1)
#ENDLOOP
'_EOF_'
    # << happy emacs

    cat << '_EOF_' > chainOne
(cd $1; cat q.*.psl | simpleChain -prot -outPsl -maxGap=50000 stdin /cluster/bluearc/worms/caePb1/bed/tblastn.ce4SG/blastOut/c.`basename $1`.psl)
'_EOF_'
    # << happy emacs
    chmod +x chainOne

    ls -1dS /cluster/bluearc/worms/caePb1/bed/tblastn.ce4SG/blastOut/sg?? \
	> chain.lst
    gensub2 chain.lst single template jobList
    para create jobList
    para maxNode 30
    para try, check, push, check etc.
# Completed: 226 of 226 jobs
# CPU time in finished jobs:      13628s     227.13m     3.79h    0.16d  0.000 y
# IO & Wait Time:                 18809s     313.48m     5.22h    0.22d  0.001 y
# Average job time:                 144s       2.39m     0.04h    0.00d
# Longest finished job:            1492s      24.87m     0.41h    0.02d
# Submission to last job:          1499s      24.98m     0.42h    0.02d

    ssh kkstore01
    cd /cluster/data/caePb1/bed/tblastn.ce4SG/blastOut
    for i in sg??
    do
       cat c.$i.psl | awk "(\$13 - \$12)/\$11 > 0.6 {print}" > c60.$i.psl
       sort -rn c60.$i.psl | pslUniq stdin u.$i.psl
       awk "((\$1 / \$11) ) > 0.60 { print   }" c60.$i.psl > m60.$i.psl
       echo $i
    done
    sort -T /scratch/tmp -k 14,14 -k 16,16n -k 17,17n u.*.psl m60* | uniq \
	> /cluster/data/caePb1/bed/tblastn.ce4SG/blastCe4SG.psl
    cd ..
    pslCheck blastCe4SG.psl
    #	checked: 32292 failed: 0

    # load table 
    ssh hgwdev
    cd /cluster/data/caePb1/bed/tblastn.ce4SG
    hgLoadPsl caePb1 blastCe4SG.psl

    # check coverage
    featureBits caePb1 blastCe4SG
    #	22988044 bases of 175247318 (13.117%) in intersection
    featureBits priPac1 blastCe4SG 
    #	5617285 bases of 145948246 (3.849%) in intersection
    featureBits caeRem2 blastCe4SG 
    #	19763359 bases of 146898439 (13.454%) in intersection
    featureBits cb3 blastCe4SG
    #	18218293 bases of 108433446 (16.801%) in intersection
    featureBits ce4 sangerGene
    #	27906202 bases of 100281244 (27.828%) in intersection

    ssh kkstore01
    rm -rf /cluster/data/caePb1/bed/tblastn.ce4SG/blastOut
    rm -rf /cluster/bluearc/worms/caePb1/bed/tblastn.ce4SG
    rmdir /cluster/bluearc/worms/caePb1/bed
    rmdir /cluster/bluearc/worms/caePb1
#end tblastn

#############################################################################
# LIFTOVER TO caePb2 (WORKING - 2008-05-14 - Hiram)
    ssh kkstore01
    screen -r -d	#	use screen to control this job
    # -debug run to create run dir, preview scripts...
    doSameSpeciesLiftOver.pl -debug caePb1 caePb2 \
      -ooc /cluster/data/caePb1/jkStuff/11.ooc
    # Real run:
    cd /cluster/data/caePb1/bed/blat.caePb2.2008-05-13
    time nice -n +19 doSameSpeciesLiftOver.pl caePb1 caePb2 \
      -ooc /cluster/data/caePb1/jkStuff/11.ooc > do.log 2>&1 &
    #	real    14m13.501s

    #	a scaffold to scaffold lift over file was made manually
    #	for Sheldon McKay.
    #	see also the scripts left in:
    #	/cluster/data/caePb1/bed/blat.caePb2.supers/run.blat
    #	/cluster/data/caePb1/bed/blat.caePb2.supers/run.chain
