#!/bin/csh -f
# "Make doc" for creating AGP files for ENCODE ortholog regions based on
#  coordinates in bed files in the "consensus" directory

# TODO: contigAcc files should go in $outDir (neater this way)

set orthoDir     = /cluster/data/encode/ortho2
set agpDir       = $orthoDir/agp
set consensusDir = $orthoDir/consensus
set downloadDir  = /usr/local/apache/htdocs/encode/downloads/EncodeAgps2
set regionAgp    = /cluster/home/kate/bin/i386/regionAgp
mkdir -p $agpDir $downloadDir
chmod o+rx $downloadDir
rm -rf $agpDir/* $downloadDir/*
cd $agpDir

########################################################################
# create script to run AGP-maker on an  assembly
rm -rf genomeAgps.csh >& /dev/null
cat > genomeAgps.csh << 'EOF'
# create AGP's for a genome assembly

if ($#argv != 3) then
    echo "usage: $0 <db> <org> <outdir>"
    exit 1
endif
echo
echo command line: $0 $1 $2 $3
echo working dir:  `pwd`

set consensusDir = /cluster/data/encode/ortho2/consensus
set db = $1
set org = $2
set outDir = $3
set buildDir = /cluster/data/$db
set bedFile = $consensusDir/$db.bed5
set regionAgp = /cluster/home/kate/bin/i386/regionAgp
mkdir -p $outDir

if (-f $org.contig.tab) then
    set contigArg = "-contigFile=$org.contig.tab"
else
    set contigArg = ""
endif

if (-f $buildDir/chrom.lst) then
    cat $buildDir/chrom.lst | \
    xargs -iX cat $buildDir/X/chr{X,X_random}.agp | \
    $regionAgp  $contigArg -namePrefix=${org}_ $bedFile stdin -dir $outDir
else
    cat $buildDir/?{,?}/*.agp | \
    $regionAgp $contigArg -namePrefix=${org}_ $bedFile stdin -dir $outDir
endif
'EOF'
# << this line makes emacs coloring happy

rm -f *.contig.tab */*.packing.list >& /dev/null

########################################################################
# MOUSE
ssh hgwdev
cd /cluster/data/encode/ortho2/agp
set mouse = mm5

# Create AGP's
csh genomeAgps.csh $mouse mouse $mouse

# Create packing list
set consensusDir = $orthoDir/consensus
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $consensusDir/$mouse.bed5 $mouse mouse "Mus musculus" 10090 \
        C57BL/6J MAY-2004 $mouse "NCBI Build 33" > $mouse/$mouse.packing.list

# Copy  to downloads area
cp -f $mouse/$mouse.packing.list $downloadDir
tar cvfz $downloadDir/$mouse.agp.tar.gz $mouse/*.agp

hgsql -e "select count(*) from $mouse.encodeRegionConsensus"
tar tvfz $downloadDir/$mouse.agp.tar.gz | wc -l
grep Region $downloadDir/$mouse.packing.list | wc -l

n######################################################################a
# RAT
set rat = rn3

# Get contig to accession map
hgsql $rat -s -e "select * from contigAcc" > rat.contig.tab

# Create AGP's
csh genomeAgps.csh $rat rat $rat

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    $consensusDir/$rat.bed5 $rat  rat "Rattus norvegicus" 10116  \
    BN/SsNHsdMCW JUN-2003 $rat "Baylor HGSC v3.1" > $rat/$rat.packing.list

# Copy  to downloads area
cp -f $rat/$rat.packing.list $downloadDir
tar cvfz $downloadDir/$rat.agp.tar.gz $rat/*.agp

hgsql -e "select count(*) from $rat.encodeRegionConsensus"
tar tvfz $downloadDir/$rat.agp.tar.gz | wc -l
grep Region $downloadDir/$rat.packing.list | wc -l

########################################################################
# CHICKEN
ssh hgwdev
cd /cluster/data/encode/ortho2/agp
set chicken = galGal2

#reversed order here to make $chicken.contig.tab before making agps
# Get contig to accession mapping (documented in makeGalGal2.doc) 
hgsql $chicken -s -e "select * from contigAcc" > chicken.contig.tab

# Create AGP's
# errors here: missing /cluster/data/galGal2/chr*_random.agp and chrM.agp files
csh genomeAgps.csh $chicken chicken $chicken

# Test a region
ssh kksilo
cd /cluster/data/encode/ortho2/agp
mkdir tests
cd tests
gunzip -c  /cluster/data/galGal2/chicken_contigs.tar.gz > \
                test.chicken.contigs.fa

ssh hgwdev
cd /cluster/data/encode/ortho2/agp/tests
set db = galGal2
set org = chicken
set region = ENm001_1
/cluster/data/encode/bin/scripts/agpAccToContig.pl $db \
        ../$db/${org}_${region}.agp > $db.$region.agp

ssh kolossus
cd /cluster/data/encode/ortho2/agp/tests
set org = chicken
set db = galGal2
set region = ENm001_1
set coords = `awk -v REGION=$region '$4 == REGION {printf "%s.%d.%d", $1, $2, $3}' /cluster/data/encode/ortho2/consensus/$db.bed`
echo $coords
set chr = $coords:r:r
nibFrag /cluster/data/$db/nib/${chr}.nib \
        $coords:r:e $coords:e + nibTest.$db.$region.fa

# takes 7 minutes on kksilo, 2-6 mins on kolossus
set fa = chicken.contigs.fa
time agpToFa $db.$region.agp ${org}_$region $db.$region.fa -simpleMulti $fa

faCmp nibTest.$db.$region.fa $db.$region.fa
    # test.fa and nibtest.fa are the same


# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    /cluster/data/encode/ortho2/consensus/$db.bed5 $db $org "Gallus gallus" \
    9031 N/A FEB-2004 $db "CGSC Feb. 2004" > ${db}/$db.packing.list

# Copy  to downloads area
cp -f $db/$db.packing.list $downloadDir
tar cvfz $downloadDir/$db.agp.tar.gz $db/*.agp

hgsql -e "select count(*) from $db.encodeRegionConsensus"
tar tvfz $downloadDir/$db.agp.tar.gz | wc -l
grep Region $downloadDir/$db.packing.list | wc -l


########################################################################
# DOG
ssh hgwdev
cd /cluster/data/encode/ortho2/agp
set dog = canFam1

#reversed order here to make $dog.contig.tab before making agps
# Get contig to accession mapping (documented in makeGalGal2.doc) 
hgsql $dog -s -e "select * from contigAcc" > dog.contig.tab

# Create AGP's
# errors here: missing /cluster/data/galGal2/chr*_random.agp and chrM.agp files
csh genomeAgps.csh $dog dog $dog

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    /cluster/data/encode/ortho2/consensus/$dog.bed5 $dog dog \
        "Canis Familiaris" 9615 N/A JUL-2004 $dog \
                "Broad Institute v. 1.0" > $dog/$dog.packing.list

# Copy  to downloads area
cp -f $dog/$dog.packing.list $downloadDir
tar cvfz $downloadDir/$dog.agp.tar.gz $dog/*.agp

hgsql -e "select count(*) from $dog.encodeRegionConsensus"
tar tvfz $downloadDir/$dog.agp.tar.gz | wc -l
grep Region $downloadDir/$dog.packing.list | wc -l

########################################################################
# CHIMP
ssh hgwdev
cd /cluster/data/encode/ortho2/agp
set chimp = panTro1
mkdir -p $chimp
echo
echo CHIMP

# Get contig to accession map
hgsql $chimp -s -e "select * from contigAcc" > chimp.contig.tab

# Create AGP's
# NOTE: next time, put chimpChromContigs.agp into $outDir
cat /cluster/data/$chimp/?{,?}/*.agp | \
   chimpChromContigAgp stdin /cluster/data/$chimp/assembly.agp chimpChromContigs.agp
set consensusDir = /cluster/data/encode/ortho2/consensus
~kate/bin/i386/regionAgp -contigFile=chimp.contig.tab -namePrefix=chimp_ \
    $consensusDir/$chimp.bed5 chimpChromContigs.agp -dir $chimp

# Create packing list
/cluster/data/encode/bin/scripts/encodeRegionPackingList $consensusDir/$chimp.bed5 \
    $chimp  chimp "Pan troglodytes" 9598 N/A NOV-2003 $chimp \
    "NCBI Build 1 v1" > $chimp/$chimp.packing.list

# Copy  to downloads area
cp -f $chimp/$chimp.packing.list $downloadDir
tar cvfz $downloadDir/$chimp.agp.tar.gz $chimp/*.agp

set downloadDir  = /usr/local/apache/htdocs/encode/downloads/EncodeAgps2
hgsql -e "select count(*) from $chimp.encodeRegionConsensus"
tar tvfz $downloadDir/$chimp.agp.tar.gz | wc -l
grep Region $downloadDir/$chimp.packing.list | wc -l

########################################################################
# cleanup

chmod -R o+r $downloadDir/* >& /dev/null
exit

########################################################################
# TESTS 

rm -fr testMouse
mkdir -p testMouse
cat /cluster/data/$mouse/{6,11}/*.agp | $regionAgp tests/test.bed \
    stdin -namePrefix=mouse_ -dir testMouse
/cluster/data/encode/bin/scripts/encodeRegionPackingList tests/test.bed \
    testMouse mouse "Mus musculus" 10090 C57BL/6J MAY-2004 $mouse "NCBI Build 33"

rm -fr testChicken
mkdir -p testChicken
rm -f $chicken.contig.tab
hgsql $chicken -s -e "select * from contigAcc" > chicken.contig.tab
cat /cluster/data/$chicken/{1,13}/*.agp | \
    $regionAgp tests/test.chicken.bed stdin -namePrefix=chicken_ \
    -dir testChicken -contigFile=chicken.contig.tab

/cluster/data/encode/bin/scripts/encodeRegionPackingList \
    tests/test.chicken.bed testChicken chicken "Gallus gallus" \
    9031 N/A FEB-2004 $chicken "CGSC Feb. 2004"

########################################################################
# Test all chicken, dog, chimp AGP's
ssh hgwdev
cd /cluster/data/encode/ortho2/agp/tests
#foreach db (galGal2 canFam1 panTro1)
foreach db (panTro1)
    echo $db
    cat ../$db/*.agp > $db.agp
    /cluster/data/encode/bin/scripts/agpAccToContig.pl $db $db.agp > \
        $db.contig.agp
end

ssh kolossus
cd /cluster/data/encode/ortho2/agp/tests
cat > testFa.csh << 'EOF'
set db = $1
set org = $2
foreach f (../$db/*.agp)
    set seq = $f:t:r
    set region = `echo $seq | sed "s/${org}_//"`
    set coords = `awk -v REGION=$region '$4 == REGION {printf "%s.%d.%d", $1, $2, $3}' /cluster/data/encode/ortho2/consensus/$db.bed`
    set chr = $coords:r:r
    nibFrag /cluster/data/$db/nib/${chr}.nib \
        $coords:r:e $coords:e + $db.nibTest/$db.$region.fa
    faCmp $db.nibTest/$db.$region.fa $db.test/${org}_$region.fa
end
'EOF'
# << for emacs

# chicken
set db = galGal2
set org = chicken
set fa = ../$org.contigs.fa
mkdir $db.test $db.nibTest
time agpAllToFaFile $db.contig.agp $fa $db.fa
faSplit byname $db.fa $db.test/
csh  testFa.csh $db $org >&! testFa.$org.log &

# dog
set db = canFam1
set org = dog
set fa = ../$org.contigs.fa
mkdir $db.test $db.nibTest
time agpAllToFaFile $db.contig.agp $fa $db.fa
faSplit byname $db.fa $db.test/
csh  testFa.csh $db $org >&! testFa.$org.log &

# chimp
set db = panTro1
set org = chimp
set fa = /cluster/data/panTro1/contigs.bases
mkdir $db.test $db.nibTest
time agpAllToFaFile $db.contig.agp $fa $db.fa
faSplit byname $db.fa $db.test/
csh  testFa.csh panTro1 chimp >&! testFa.chimp.log &


cd /cluster/data/encode/ortho2/agp/tests
cat > getSeqFromAcc.pl <<EOF
#!/usr/bin/perl -W
 
$URL = "http://www.ncbi.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&rettype=fasta&id=";
$db=shift;
$dir = "/cluster/data/encode/ortho2/agp/tests/$db.contig.fa";
system("mkdir -p $dir");
 
while (<>)
{
    chomp();
    system("wget -O $dir/$_ \"$URL$_\"");
}
EOF
#
chmod ug+x getSeqFromAcc.pl

cat > accHeader.pl <<EOF
#!/usr/bin/perl -W
$dir=shift;
$db=shift;
opendir(DIR,$dir) or die "Can't open $dir";
open (OUT, ">$db.contigAccs.fa");
@files = readdir(DIR);
foreach $file (@files)
{
    open (IN,  "<$dir/$file");
    while (<IN>)
    {
        if (/^>/)
        {
            @f=split /\|/;
            print OUT ">$f[3]\n";
        }
        else
        {
            print OUT;
        }
    }
    close(IN);
}
close(OUT);
EOF
#
chmod ug+x accHeader.pl

# rat
set db  = rn3
set org = rat
set fa  = $db.contig.fa
mkdir -p $db.test $db.nibTest

cat ../$db/${org}_EN*.agp | grep W | cut -f6 > $db.accessions
./getSeqFromAcc.pl $db < $db.accessions >& $db.log
./accHeader.pl $db.contig.fa $db >>& $db.log

agpAllToFaFile $db.contig.agp $fa $db.fa
faSplit byname $db.fa $db.test/
csh  testFa.csh $db $org >&! testFa.$org.log &

# mouse
set db  = mm5
set org = mouse
set fa  = $db.contig.fa
mkdir -p $db.test $db.nibTest

cat ../$db/${org}_EN*.agp | grep W | cut -f6 > $db.accessions
./getSeqFromAcc.pl $db < $db.accessions >& $db.log
./accHeader.pl $db.contig.fa $db >>& $db.log

agpAllToFaFile $db.contig.agp $fa $db.fa
faSplit byname $db.fa $db.test/
csh  testFa.csh $db $org >&! testFa.$org.log &

