#!/usr/bin/perl -W
# takes dbSnp XML files as stdin and prints condensed information
# to stdout in one line per SNP in tab delimited format

# Parser state, kept in package globals shared by the loops that follow.
# String-valued fields start empty; $chrom is filled in from the file header,
# the others are refilled for every SNP / location.
$chrom=$strand=$alleles=$class=$func=$valid="";
# $contig is pattern-matched whenever a location closes; start it as an
# empty string so that match never sees an undefined value when no contig
# id has been read yet.
$contig="";
# Numeric fields start at zero; a $chromEnd of 0 is later treated as
# "no position seen for this location".
$chromEnd=$rsId=$score=$het=$hetSE=0;

# Constant written into the source column of every output row.
$source="dbSnp";

# Deliberate guard: the script aborts here unconditionally until the
# chromosome handling described in the message has been fixed.  Everything
# after this statement is unreachable while the guard is in place.
die "\nFix the chromosome issue - like the pseudoautosomal errors.  \nExtend this to the whole genome by getting the chromosome \nfrom within NSE-rs_contig-mapset rather than from the header\n\n";

# Consume the file header up to and including the first <NSE-rs> line.
# While skipping, pick the chromosome name out of the query string that the
# header echoes back and store it as "chr<name>".
while (defined($line = <>))
{
    if ($line =~ /<(NSE-ExchangeSet_query)>select distinct snp_id from vwSNP_hs_chr where chr = &apos;(.*)&apos;<\/\1>/)
    { $chrom = "chr$2"; }
    last if $line =~ /<NSE-rs>/;
}
# Input that ends before any <NSE-rs> line leaves $line undefined.  Stop
# with a clear error instead of testing an undefined value forever, which
# is what a do/until loop over <> does at end of input.
die "no <NSE-rs> record found in input\n" unless defined $line;

# Main pass over the remaining input, one XML line at a time.
#
# Each recognised element updates one global field.  A closing
# </NSE-rsMaploc> stores the current location in %loc (keyed by end
# position); a closing </NSE-rs> prints one tab-delimited row per stored
# location and resets the per-SNP state.
#
# Output columns: chrom, chromStart, chromEnd, rs<id>, score, strand,
# alleles, source, class, validation list, het, hetSE, function list.
while (<>)
{
    if (/<(NSE-rsMaploc_physmap-int)>(\S+)<\/\1>/)          { $chromEnd = $2;}
    elsif (/<(NSE-rsContigHit_contig-id)>(\S+)<\/\1>/)      { $contig   = $2;}
    elsif (/<(NSE-rs_refsnp-id)>(\S+)<\/\1>/)               { $rsId     = $2;}
    elsif (/<(NSE-rsMaploc_orient value=\")(\S+)\"\/>/)     { $strand   = $2;}
    elsif (/<(NSE-rs_observed)>(\S+)<\/\1>/)                { $alleles  = $2;}
    elsif (/<(NSE-rs_snp-class value=\")(.*)\"\/>/)         { $class    = $2;}
    elsif (/<(NSE-rs_het)>(\S+)<\/\1>/)                     { $het      = $2;}
    # A het-SE of "NaN" is ignored, leaving $hetSE at its previous value.
    elsif (/<(NSE-rs_het-SE)>(\S+)<\/\1>/ && ($2 ne "NaN")) { $hetSE    = $2;}
    # Function classes and validation types are collected as comma-terminated
    # lists; index() keeps a value from being appended twice.
    elsif (/<(NSE-FxnSet_fxn-class-contig value=\")(\S+)\"\/>/)
    { if (index($func, $2) == -1) { $func  .= "$2,";} }
    elsif (/<(NSE-rs_validated-)(\S+) value=\"true\"\/>/)
    { if (index($valid,$2) == -1) { $valid .= "$2,";} }
    elsif (/<\/NSE-rsMaploc>/) # end of current location
    {   # store current pos/strand/fxn and reinit
        # Locations on a contig whose id contains "b", or with no position
        # seen, are collapsed to end position 1.  $contig can still be
        # undefined here if no contig id has been read, hence the guard.
        # NOTE(review): all such locations of one SNP share key 1 in %loc,
        # so only the last one is kept - confirm that is intended.
        if    ( (defined $contig && $contig =~ /b/) || $chromEnd == 0 ) { $chromEnd = 1; }
        if    ( $func eq "" )                      { $func     = "unknown,"; }
        if    ( $strand eq "forward" )             { $strand   = "+"; }
        elsif ( $strand eq "reverse" )             { $strand   = "-"; }
        # Strip spaces: the value is packed as "$strand $func" and split on
        # a single space when the SNP is printed.  The /d flag is required;
        # tr/ // without it only counts characters.
        $func =~ tr/ //d;
        substr($func, -1) = ""; # remove trailing comma
        $loc{$chromEnd}="$strand $func";
        $strand=$func="";
        $chromEnd=0;
    }
    elsif (/<\/NSE-rs>/) # end of current SNP
    {
        # NOTE(review): this is an if/elsif chain, so only the first
        # matching validation type contributes to $score.  The power-of-two
        # weights suggest independent flags may have been intended; behaviour
        # is left unchanged pending confirmation.
        if    (index($valid,"other-pop")       != -1) { $score += 1; }
        elsif (index($valid,"by-frequency")    != -1) { $score += 2; }
        elsif (index($valid,"by-cluster")      != -1) { $score += 4; }
        elsif (index($valid,"by-2hit-2allele") != -1) { $score += 8; }
        if ($valid   eq "") {$valid = "no-information,"; }
        substr($valid, -1) = ""; # remove trailing comma
        $alleles =~ tr/\//,/;    # "A/G" becomes "A,G"
        if ($alleles eq "") { $alleles = "?"; }
        # Positions are numbers, so sort them numerically; the default
        # string sort would put "100" before "99".
        for $chromEnd (sort { $a <=> $b } keys %loc )
        {
            ($strand, $func) = split / /, $loc{$chromEnd};
            $chromStart = $chromEnd - 1;
            print  "$chrom\t$chromStart\t$chromEnd\trs$rsId\t$score\t$strand\t";
            print  "$alleles\t$source\t$class\t$valid\t";
            printf "%.4f\t%.4f\t%s\n",$het, $hetSE, $func;
        }
        # Reset everything that belongs to a single SNP; $chrom and $contig
        # are intentionally carried over.
        %loc=();
        $strand=$alleles=$class=$func=$valid="";
        $rsId=$score=$het=$hetSE=0;
    }
}
