#!/bin/sh
# Guard: this file is a record of commands, not a runnable program.
# If it is executed anyway, it prints a notice and exits with status 255
# before any of the commands below are reached.
echo "This is not a script"
exit 255

#########  Creating the Intronerator for C. elegans WS120 version
#
#	This is done after the Ce2 browser has been built, using
#	the sequences from there.
#

#  Linking to sequence and creating NIB files (old style nibs)

#	Create the Intronerator work directory, link the Ce2 sanger files
#	into it and make an empty nt4 directory; this step ends inside nt4.
#	The directory is named idb.ws120 because that is the name every
#	later step in these notes uses to find sanger/ and nt4/.
ssh hgwdev
cd /projects/compbio/experiments/worm
mkdir idb.ws120
cd idb.ws120
mkdir sanger
ln -s /cluster/data/ce2/sanger/* sanger

mkdir nt4
cd nt4

#	program fatont4 is in kent/src/reformat/fatont4
#	The *.nt4 files created are machine specific.
#	You can not use nt4 files on different machines.
#	ALSO, the index files can not be used on different machines.
#	If you want to move the Intronerator to a different machine
#	you basically need to do this processing on that machine

#	Write dna2nt4.sh into the current directory, make it executable
#	and run it.  The here-document word is quoted so that $1, $2 and
#	${S} are written literally into the generated script.  In sh/bash
#	the closing line must be the bare word _EOF_ (no quotes), otherwise
#	the here-document never ends and swallows the commands after it.
cat << '_EOF_' > dna2nt4.sh
#!/bin/sh
# Convert each Ce2 chromosome fasta file into an nt4 file in the
# current directory, skipping any whose .nt4 file already exists.

S=/cluster/data/ce2/sangerFa
export S

# DoOne <chrom> <name>
#   Runs fatont4 on ${S}/chr<chrom>.fa to produce <name>.nt4 unless
#   <name>.nt4 is already present as a regular file.
DoOne() {
        if [ ! -f "$2.nt4" ]; then
                echo "chr$1.fa -> $2.nt4"
                rm -f "$2.nt4"
                fatont4 "${S}/chr$1.fa" "$2.nt4"
        else
                echo "$2.nt4 already complete"
        fi
}

DoOne M m
DoOne I i
DoOne II ii
DoOne III iii
DoOne IV iv
DoOne X x
DoOne V v
_EOF_

chmod +x dna2nt4.sh
time ./dna2nt4.sh

#  create features listings from gff files
#	binary makec2c is in kent/src/reformat/makec2c

#	Create the features directory and run makec2c from the top of the
#	work directory.  Everything makec2c prints (stdout and stderr) is
#	captured in features/make.c2c.out.
#	NOTE(review): argument roles are presumably <gff dir> <c2c output>
#	<unused output sent to /dev/null> - TODO confirm against makec2c usage.
ssh hgwdev
cd /projects/compbio/experiments/worm/idb.ws120
mkdir features
time makec2c sanger features/c2c /dev/null > features/make.c2c.out 2>&1
#	Timing reported by "time" for the run above:
#	real    0m31.317s
#	user    0m27.290s
#	sys     0m3.220s

#	these *.dir files are used by some of the processing utilities
#	here and in the browser to find these directories
#
#	Each <name>.dir file holds a single line: the absolute path, with
#	trailing slash, of one work-directory subtree.  The files are
#	written in features/sanger and then symlinked into the top of the
#	work directory.  Entries below are <file stem>:<subdirectory>.
mkdir features/sanger
cd features/sanger
for entry in cdna:cDNA feat:features nt4:nt4 xeno:xeno
do
        echo "/projects/compbio/experiments/worm/idb.ws120/${entry#*:}/" \
                > "${entry%%:*}.dir"
done
cd ../..
ln -s features/sanger/*.dir .

#	makepgo is in kent/src/index/makepgo

#	Run from the top of the work directory.
#	NOTE(review): makepgo arguments look like <input listing>
#	<output directory> <output suffix>; the recorded output of the
#	second makepgo call below shows files named <chrom>.pgo - TODO
#	confirm the same layout applies to the .coo call.
makepgo features/c2c features/ .coo

#	gffgenes is run from inside sanger; both file arguments and the
#	captured output (stdout and stderr) are under ../features/sanger.
cd sanger
time gffgenes ../features/sanger/c2g ../features/sanger/genes.gdf \
	> ../features/sanger/gffgenes.out 2>&1
#	real    0m36.061s
#	user    0m31.660s
#	sys     0m3.660s
#	Recorded per-chromosome progress messages from that run:
#	Processing CHROMOSOME_I.gff.gz
#	Read 6479 genes (103864 exons) in CHROMOSOME_I.gff.gz
#	Processing CHROMOSOME_II.gff.gz
#	Read 7541 genes (108565 exons) in CHROMOSOME_II.gff.gz
#	Processing CHROMOSOME_III.gff.gz
#	Read 5971 genes (93138 exons) in CHROMOSOME_III.gff.gz
#	Processing CHROMOSOME_IV.gff.gz
#	Read 7198 genes (106683 exons) in CHROMOSOME_IV.gff.gz
#	Processing CHROMOSOME_V.gff.gz
#	Read 10137 genes (139252 exons) in CHROMOSOME_V.gff.gz
#	Processing CHROMOSOME_X.gff.gz
#	Read 6141 genes (112575 exons) in CHROMOSOME_X.gff.gz

#	Back at the top of the work directory: keep a copy of c2g directly
#	under features (ixword3 below reads features/c2g), then index the
#	features/sanger copy.
cd ..
cp -p features/sanger/c2g features
makepgo features/sanger/c2g features/sanger/ .pgo
#	Recorded output of the makepgo call above:
#	Writing 6479 entries nameSize 20 into features/sanger/i.pgo
#	Writing 7541 entries nameSize 14 into features/sanger/ii.pgo
#	Writing 5971 entries nameSize 13 into features/sanger/iii.pgo
#	Writing 7198 entries nameSize 13 into features/sanger/iv.pgo
#	Writing 10137 entries nameSize 13 into features/sanger/v.pgo
#	Writing 6141 entries nameSize 12 into features/sanger/x.pgo
#	Writing 0 entries nameSize 0 into features/sanger/m.pgo

#	indexgl is in kent/src/index/indexgl

#	indexgl is run inside features/sanger on the genes.gdf file that
#	gffgenes was given above.
cd features/sanger
indexgl genes.gdf genes.ix
#	Reading input
#	Got 43467 offsets 21 nameSize.  Sorting...
#	Writing index file

#	ixword3 is in kent/src/index/ixword3
#	Run from the top of the work directory; each call is given a
#	listing and the .ix file name that sits beside it.
cd ../..
ixword3 features/c2c features/c2c.ix
#	Got 3391 offsets 10 nameSize.  Sorting...
ixword3 features/c2g features/c2g.ix
#	Got 43467 offsets 21 nameSize.  Sorting...

#	Download the GenBank mRNAs:
#	mRNA_title.gb obtained from http://www.ncbi.nlm.nih.gov/entrez/
#	with a search on "nucleotide" and a search string of:
#	mRNA[Title] AND Caenorhabditis Elegans[Organism]

#	resulting in 254069 matches, downloaded in "GenBank" format
#	and a pretty significant file:

#	-rw-rw-r--    1 hiram    572034420 Mar  2 00:28 mRNA_title.gb

#	convert the downloaded genbank format into a .fa and a .cdi file:
#	can build gb2cdi binary in src/reformat/gb2cdi
#	make sure it gets into your cse home bin MACHTYPE directory
#	for this kks00 operation
#
#	Convert the downloaded mRNA_title.gb and index the resulting fasta,
#	working inside the cDNA directory on kks00.
#	NOTE(review): no mkdir for cDNA appears in these notes; presumably
#	it was created when mRNA_title.gb was downloaded - confirm.
ssh kks00
cd /projects/compbio/experiments/worm/idb.ws120/cDNA
gb2cdi mRNA_title.gb allcdna.fa allcdna.cdi > gb2cdi.out 2>&1
#	The last four lines of gb2cdi.out are the summary counts:
tail -4 gb2cdi.out
#	96094 by Kohara, 1515 by Martin, 3184 by Stratagene,
#		388 Univ Ariz/U Wash, 152880 full cDNAs
#	0 male 141394 hermaphrodite 86165 mixed
#	32 adult 71547 embryo 80454 varied 70110 L1 5180 L2 3346 L4 0 other
#	253941 cDNAs in all

#	previous counts were:
#	96094 by Kohara, 1515 by Martin, 3184 by Stratagene,
#		388 Univ Ariz/U Wash, 113712 full cDNAs
#	1 male 141467 hermaphrodite 47123 mixed
#	33 adult 55646 embryo 57368 varied 70127 L1 5180 L2 3346 L4 0 other
#	214773 cDNAs in all
#
#	indexfa is in kent/src/index/indexfa
#
#	Note that indexfa output is appended (>>) to indexfa.out, so the
#	log accumulates across repeated runs.
time indexfa allcdna.fa allcdna.ix >> indexfa.out 2>&1
#	real    0m15.549s
#	user    0m5.660s
#	sys     0m1.680s
#
#	fixup names from WABA alignments from Ce2
#	to access cluster filesystems and projects/compbio, on hgwdev:
#	On hgwdev, create xeno/cbriggsae under the work directory; this
#	is the directory the all.st file is written into.
ssh hgwdev
cd /projects/compbio/experiments/worm/idb.ws120
mkdir xeno
cd xeno
mkdir cbriggsae
#!/bin/sh
#
#	Concatenate the Ce2 WABA output files (c*.wab) of every chromosome
#	into one all.st file, replacing the cluster-local sequence name
#	/iscratch/.../chr<N>.fa.chr<N> with the lower-case chromosome name
#	used by the other Intronerator files (i, ii, iii, iv, v, x, m).
#	Only the first occurrence on each line is replaced.
RESULT=/projects/compbio/experiments/worm/idb.ws120/xeno/cbriggsae/all.st
THERE=/cluster/data/ce2/bed/waba/out/

#	chromToIdbName <chrom>
#	Print the lower-case Intronerator name for a Ce2 chromosome name
#	(I -> i, IV -> iv, ...).  Returns 1, with a message on stderr,
#	for a name it does not know.
chromToIdbName() {
        case "$1" in
            I) echo i;;
            II) echo ii;;
            III) echo iii;;
            IV) echo iv;;
            V) echo v;;
            X) echo x;;
            M) echo m;;
            *) echo "unknown chromosome: $1" >&2; return 1;;
        esac
}

for i in I II III IV V X M
do
        IChr=$(chromToIdbName "$i") || continue
        #	Work in a subshell: a failed cd then skips the chromosome
        #	instead of letting sed read files from the wrong
        #	directory, and the caller's working directory is kept.
        (
        cd "${THERE}/chr${i}" || exit 1
        for FN in c*.wab
        do
                #	an unmatched glob stays literal; skip it
                [ -f "${FN}" ] || continue
                sed -e \
"s#/iscratch/i/worms/Celegans2/unmaskedFa/chr${i}.fa.chr${i}#${IChr}#" "${FN}"
        done
        )
done > "$RESULT"

#	back on kks00, index that all.st file, and st files
#	program ixxenost is in kent/src/index/ixxenost
#	Both programs below are run from the top of the work directory on
#	kks00 and are given the all.st file written in the previous step.
ssh kks00
cd /projects/compbio/experiments/worm/idb.ws120
time ixxenost xeno/cbriggsae/all.st xeno/cbriggsae/all.ix
#	real    4m29.919s
#	user    3m50.790s
#	sys     0m14.070s
#	stToXao is in kent/src/index/stToXao
#	this one reads the *.dir files to find things, and expects to
#	find those files in the current working directory
time stToXao xeno/cbriggsae/all.st xeno/cbriggsae/
#	real    4m37.433s
#	user    3m51.040s
#	sys     0m14.650s

#	Create the ea and ra directories; later steps in these notes write
#	ea/all.out (gathered cluster results) and ra/*.txt (refiAli output).
cd /projects/compbio/experiments/worm/idb.ws120
mkdir ra ea


#  prepare mRNA alignments cluster job
#	Stage the inputs for the cluster run.  Each "ssh" is a hop to the
#	host on which the following commands were run.
#
#	eieio: copy the cDNA files (allcdna.*) and the nt4 files from the
#	work directory to /cluster/bluearc/ce2.
ssh eieio
mkdir /cluster/bluearc/ce2/nt4
mkdir /cluster/bluearc/ce2/exonAli
cp -p  /projects/compbio/experiments/worm/idb.ws120/cDNA/allcdna.* \
	/cluster/bluearc/ce2/exonAli
cp -p  /projects/compbio/experiments/worm/idb.ws120/nt4/*.nt4 \
	/cluster/bluearc/ce2/nt4
#	kkr1u00: copy the nt4 files onto /iscratch and run iSync.
#	NOTE(review): iSync presumably propagates /iscratch to the cluster
#	nodes - confirm.
ssh kkr1u00
mkdir /iscratch/i/worms/Celegans2/nt4
cp -p /cluster/bluearc/ce2/nt4/*.nt4 /iscratch/i/worms/Celegans2/nt4
/cluster/bin/iSync
#	kk: create the directory the cluster job is set up and run in.
ssh kk
mkdir /cluster/data/ce2/bed/exonAli
cd /cluster/data/ce2/bed/exonAli
#	The number 253943 below comes from a count of the number of
#	records found in allcdna.fa
#	faCount allcdna.fa > faCount.out
#	wc faCount.out
#	253943 2031544 8308388 faCount.out
#	jobList: one line per batch of 200 cDNAs, starting at record 0 and
#	stepping by 200 while the start index is below 253943.  Each line
#	names a results/NNNNNN.out file inside a {check out line+ ...}
#	clause, followed by the start index and the batch size.
awk 'BEGIN {
        total = 253943
        batch = 200
        for (start = 0; start < total; start += batch)
                printf "runOne {check out line+ results/%06d.out} %d %d\n", \
                        start, start, batch
}' > jobList

#	runOne: the per-batch wrapper named on every jobList line.  It
#	passes its three arguments to exonAli and sends exonAli's stdout
#	to out/<second argument>.out.
#	NOTE(review): neither "out" nor "results" is created by any command
#	shown in these notes - confirm they exist before "para create".
{
        printf '#!/bin/sh\n'
        printf '/cluster/bin/i386/exonAli starting $1 \\\n'
        printf '\t/cluster/bluearc/ce2/exonAli/allcdna.fa \\\n'
        printf '\t/iscratch/i/worms/Celegans2/nt4 $2 $3 > out/$2.out\n'
} > runOne
chmod +x runOne
#	Submit the batch described by jobList and report its timing.
para create jobList
para try
para push
para time
# Completed: 1270 of 1270 jobs
# CPU time in finished jobs:     161830s    2697.16m    44.95h    1.87d  0.005 y
# IO & Wait Time:                336112s    5601.87m    93.36h    3.89d  0.011 y
# Average job time:                 392s       6.53m     0.11h    0.00d
# Longest job:                      625s      10.42m     0.17h    0.01d
# Submission to last job:           725s      12.08m     0.20h    0.01d

#	Gather every batch result into ea/all.out in the work directory.
#	The work directory is /projects/compbio/experiments/worm/idb.ws120
#	(no intermediate "idb/" component): that is where "mkdir ra ea"
#	was run and where refiAli later reads ea/all.out.
cat results/*.out > /projects/compbio/experiments/worm/idb.ws120/ea/all.out

#	refine those alignments  (refiAli source is in src/cdnaAli/refineAli)
#	All commands below run from the top of the work directory on
#	kks00; each one captures its stdout and stderr in a *.out log.
#	Timings are recorded after each command even where the command
#	line is shown without "time".
ssh kks00
cd /projects/compbio/experiments/worm/idb.ws120
#	this uses the ea/all.out which is from blasting the mRNA's
#	The ra/*.txt files named on the command line are refiAli's
#	result files; ra/good.txt is the one the later steps read.
time refiAli ea/all.out cDNA/allcdna nt4 ra/good.txt \
        ra/bad.txt ra/cool.txt ra/err.txt 0 1000000 features/sanger/c2g \
        > ra/refiAli.out 2>&1
#	real    100m22.510s
#	user    96m34.360s
#	sys     1m29.580s
#	binGood is in kent/src/cdnaAli/binGood and it needs the object
#	in kent/src/cdnaAli/lib/cdnaAli.o
binGood ra/good.txt cDNA/good.ali > cDNA/binGood.out 2>&1
#	real    0m14.993s
#	user    0m12.610s
#	sys     0m1.010s
#	ixali is in kent/src/index/ixali
ixali cDNA/good.ali cDNA/good.ix > cDNA/ixali.out 2>&1
#	real    0m4.795s
#	user    0m4.290s
#	sys     0m0.410s
#	ali2alx is in kent/src/cdnaAli/ali2alx
ali2alx cDNA/good.ali cDNA/ > cDNA/ali2alx.out 2>&1
#	real    0m4.038s
#	user    0m3.770s
#	sys     0m0.190s
#	cdnaOff is in kent/src/cdnaAli/cdnaOff
cdnaOff ra/good.txt cDNA/ > cDNA/cdnaOff.out 2>&1
#	real    0m12.397s
#	user    0m11.490s
#	sys     0m0.570s

#	XXXX
#	NOTE(review): the XXXX marker above was a bare word in the original
#	notes, which is not a command.  It presumably flags that the steps
#	from here on were unfinished or unverified - confirm before
#	relying on them.
mkdir html
mkdir obsolete
#	The ra/good.txt is the input, all the other files on the command
#	line are created by this command,
#	also using: features/c2g.ix features/c2c.ix
#	stdout and stderr of the run are captured in cDNA/introns.out.
time introns ra/good.txt cDNA/introns.gff cDNA/introns.txt \
        obsolete/altintron.txt html/altsplice.raw.html \
        cDNA/introns.fa > cDNA/introns.out 2>&1

#	Fetch gene name reference list:
#	Download gene_names.txt into the sanger directory.  --timestamping
#	makes wget skip the download when the local copy is not older than
#	the remote file.
#	NOTE(review): the URL points at the WS122 release although this
#	build is WS120 - confirm that is intended.
cd /projects/compbio/experiments/worm/idb.ws120/sanger
wget --timestamping \
    ftp://ftp.wormbase.org/pub/wormbase/elegans/WS122/GENE_DUMPS/gene_names.txt

#	Back at the top of the work directory: convert the downloaded list
#	to features/orf2gene.txt with the perl script kept under features.
cd /projects/compbio/experiments/worm/idb.ws120
./features/gene_names2orf2gene.pl sanger/gene_names.txt > features/orf2gene.txt

#	Both commands below capture stdout and stderr in a *.out log next
#	to their other files.
makeOrf2gene features/orf2gene.txt sanger/orf2gene \
	sanger/syn > sanger/makeOrf2gene.out 2>&1
moresyn sanger/syn features/syn features/orf2gene \
	features/orfInfo > features/moresyn.out 2>&1

#
#	Ready to publish
#
ssh hgwdev
#	1. Create the published tree under /cluster/store4/worm/WS120.
mkdir /cluster/store4/worm/WS120
for sub in nt4 cDNA features xeno xeno/cbriggsae features/sanger
do
        mkdir "/cluster/store4/worm/WS120/${sub}"
done

#	2. Copy the finished files from the work directory.  Only the
#	*.nt4 files are taken from nt4; every other directory is copied
#	with a plain "*" (no -r, so subdirectories are not descended).
cd /cluster/store4/worm/WS120/nt4
cp -p /projects/compbio/experiments/worm/idb.ws120/nt4/*.nt4 .
for sub in xeno/cbriggsae features/sanger features cDNA
do
        cd "/cluster/store4/worm/WS120/${sub}"
        cp -p "/projects/compbio/experiments/worm/idb.ws120/${sub}"/* .
done

#	3. Expose the published tree through symlinks in /gbdb/intronWS120.
mkdir /gbdb/intronWS120
for sub in nt4 cDNA features xeno
do
        mkdir "/gbdb/intronWS120/${sub}"
        ln -s "/cluster/store4/worm/WS120/${sub}"/* "/gbdb/intronWS120/${sub}"
done

#	4. Write the *.dir pointer files in the web directory; each holds
#	the /gbdb path, with trailing slash, of one subtree.  Entries
#	below are <file stem>:<subdirectory>.
mkdir /usr/local/apache/htdocs/IntronWS120
cd /usr/local/apache/htdocs/IntronWS120
for entry in cdna:cDNA feat:features nt4:nt4 xeno:xeno
do
        echo "/gbdb/intronWS120/${entry#*:}/" > "${entry%%:*}.dir"
done

