#!/usr/bin/perl

# This script was originally written by Yontao Lu and was copied from
# /cluster/store4/ratMarker/code/ into the kent/src tree in order
# to get it under CVS control; added warnings & strict and incorporated a 
# fix for getN from /cluster/store5/mouseMarker/code/luConvertPrimerToFa.
# Also, I parse size ranges and compute the average, instead of feeding 
# the ranges directly to the calculation for nSize which seems to result 
# in the lower number being used.

# $Id: luConvertPrimerToFa,v 1.2 2007/08/03 16:45:40 hiram Exp $

use warnings;
use strict;

##################################################
##Yontao Lu
##date: 12/23/02
##Des: Take a bed file and output a fasta file.
##1. Just know Forwards and reverse put N in it according to the size.
##If size don't know put default size which in average expected-size
##2. If know clone sequence, put clone sequence as another sequence entry
##with "|clone" 
###########################################################################



my $n25s = "nnnnnnnnnnnnnnnnnnnnnnnnn";

sub getN
{
    my ($nSize, @rest) = @_;
    my $lines = int($nSize / 25);
    my $remainder = $nSize % 25;
    my $ns = "";
    for(my $j = 0; $j < $lines - 1; $j++)
    {
	 $ns .= "$n25s\n";
    }
    $ns .="$n25s";
    my $leftSeq = substr($n25s, 0, $remainder);
    $ns .= "\n$leftSeq" if ($leftSeq ne "");
    return $ns;
}


sub reverse
{
    my (@primer) = @_;
    my @newS = ();
    #my $errors = 0; ## For testing purpose.
    for(my $k = 0; $k < @primer; $k++)
    {
	$primer[$k] =~ s/\s+//g;
	my @chars = split(//, $primer[$k]);
	my @newChar = ();
	for(my $j = $#chars; $j >= 0; $j--)
	{
	    if($chars[$j] eq 'a' || $chars[$j] eq 'A')
	    {
		push(@newChar, "T");
	    }
	    elsif($chars[$j] eq 't'|| $chars[$j] eq 'T')
	    {
		push(@newChar, "A");
	    }
	    elsif($chars[$j] eq 'c' || $chars[$j] eq 'C')
	    {
		push(@newChar, "G");
	    }
	    elsif($chars[$j] eq 'g' || $chars[$j] eq 'G')
	    {
		push(@newChar, "C");
	    }
	    else
	    {
		#$errors = 1;
		push(@newChar, "N");
	    }
	}
	my $newStr = join ("",@newChar);
	push(@newS, $newStr);
    }
    #return ($errors, @newS); ##just for testing purpose
    return (@newS);
}




#set default parameter
my $header = 0;
my $aveSize = 150;
my $ucscId = 1;
my @file = ();

while(my $arg = shift)
{
    if($arg eq "-h")
    {
        $header = 1;
    }
    elsif($arg eq "-a")
    {
        $aveSize = shift;
    }
    else
    {
        push(@file, $arg);
    }
}

if(@file < 4)
{
    print STDERR "Usage: luConvertPrimerToFa [-a avgSize] [-h] stsInfo primer.fa clone.fa primer.info
-a: average size.
-h: has header. default no header.\n";
    exit(1);
}


#open file
open(INFO, "<$file[0]") || die "Can't open marker file. $file[0] $!";
open(FA, ">$file[1]") || die "Can't open primer file. $file[1] $!";
open(CLON, ">$file[2]") || die "Can't open clone file. $file[2] $!";
open(PRI, ">$file[3]") || die "Can't open primer file. $file[3] $!";
while(my $line = <INFO>)
{
    chomp($line);
    my (@eles ) = split(/\t/, $line);
    
    #  size field may specify KB, BP, ~, or a comma separated list
    my $kbMultiplier = 1;
    if($eles[9] =~ m/KB/i) { $kbMultiplier = 1000; }
    $eles[9] =~ s/\~|\s*KB\s*|BP//gi;	#	remove known garbage
    my @sizeList = split(',\s*|-',$eles[9]);
    my $sizeSum = 0;
    my $val = $aveSize;
    if (scalar(@sizeList) > 0)
    {
	for (my $j = 0; $j < scalar(@sizeList); ++$j)
	{
	    if($sizeList[$j] !~ /\d/o ||
	       ($sizeList[$j] =~ /^\d+$/o && $sizeList[$j] == 0))
	    {
		$sizeSum += $aveSize;
	    } else {
		$sizeSum += $sizeList[$j] * $kbMultiplier;
	    }
	}
	$val = int($sizeSum / scalar(@sizeList));
    }
    my $priInfo = $val;
   
    if($eles[7] ne "" && $eles[7] ne "N/A")
    {
	my ($ends, @rest) = &reverse($eles[8]);

	my @forwards = split(//,$eles[7]);
	my @rewards = split(//,$ends);
	my $ns = getN(($val-$#forwards-$#rewards-2));
	print FA ">$eles[0]_$eles[1]
$eles[7]
${ns}
$ends\n";
	print PRI "$eles[0]\t$eles[7]\t$eles[8]\t$priInfo\t$eles[1]\n";
    }
    
    if($eles[24] && $eles[24] ne "N/A" && $eles[24] =~ /\w/o)
    {
	#my (@seqs) = split("", $eles[24]);
	print  CLON ">$eles[0]_$eles[1]_clone\n";
	my $len = length($eles[24]);
	for(my $i=0; $i < $len; $i=$i+50)
	{
	    print CLON substr($eles[24], $i, 50),"\n";
#$eles[24]\n";
	}

   }
}

close(INFO);
close(FA);
close(CLON);
close(PRI);
