#!/usr/bin/perl -w

use warnings;
use Getopt::Std;
use strict;
use Pod::Usage;
use Data::Dumper;
my %opts;

# Every switch takes a value:
#   -a / -b  p-value thresholds        -l  window length
#   -n       minimum SNP count         -o  RNAplfold opening energy file
#   -j       base name of the output files
getopt("abolnj", \%opts);

my @dataArray;        # $dataArray[$pos] = list of SNP records seen at $pos
my $dataArrayIndex=0;
my $seq="";
my $seqLen=0;         # number of data lines read from the -o file
my $job="data";       # base name of the generated .csv / .R / .pdf files

# -a and -b are compared against every SNP and written into each CSV row,
# so they are required just like -l, -n and -o: an undefined threshold
# would compare as 0 and leave empty fields that shift the CSV columns.
my @missing = grep { !defined($opts{$_}) } qw(a b l n o);
if(@missing){
  # pod2usage() prints the manual and terminates the script by itself.
  pod2usage(-message => "Missing required option(s): "
                        .join(" ", map { "-$_" } @missing),
            -verbose => 2);
}

# The window length sizes the sliding window below; it has to be a
# positive integer for the window bookkeeping to make sense.
if($opts{l} !~ /\A[0-9]+\z/ || $opts{l} < 1){
  pod2usage(-message => "Option -l must be a positive integer",
            -exitval => 2,
            -verbose => 1);
}

# The bare block keeps the working variables out of file scope.
{
  # assign name for the output files
  $job=$opts{j} if(defined($opts{j}));

  #Get Data from RNAplfold
  my $openEnergy;
  open(my $rfoldFh, '<', $opts{o})
    or die "Cannot open opening energy file '$opts{o}': $!\n";
  $openEnergy->{0}=0;
  while(my $line=<$rfoldFh>){
    next if($line=~/#/);     # header / comment lines
    next if($line!~/\S/);    # blank lines do not describe a position
    chomp($line);
    my @data=split(/\s+/,$line);
    # Field 0 is the lookup key, field 20 the value written to the
    # "opening" CSV column (presumably the opening energy of a 20-nt
    # stretch as produced by RNAplfold -u 20 -- TODO confirm).
    # A missing field is stored as "NA" so the CSV keeps its column count.
    $openEnergy->{$data[0]}=(defined($data[20]) ? $data[20] : "NA");
    $seqLen++;
  }
  close($rfoldFh);

  #Assign an initial value to all nucleotide position
  print $seqLen,"\n";
  for my $i (0 .. $seqLen-1){
    push @{$dataArray[$i]}, { a => 1, b => 1, pos => $i, mut => "" };
  }

  #Start parsing the result file;
  while(my $line=<STDIN>){
    my @data=split(/\s+/,$line);
    # Only lines whose first field contains a number are SNP records;
    # the first run of digits in that field is taken as the position.
    next unless(defined($data[0]) && $data[0]=~/([0-9]+)/);
    my $pos = $1;
    # Fields 4, 6, 8 and 10 are read below. A shorter line would yield
    # undefined p-values that compare as 0 and count as disruptive.
    if(@data < 11){
      warn "Skipping input line $.: expected at least 11 fields, got "
           .scalar(@data)."\n";
      next;
    }
    $dataArrayIndex=$pos;
    my $hash;
    #the rest of the values are saved into the dataArray table
    $hash->{a} = $data[6];
    $hash->{b} = $data[10];
    $hash->{pos} = $pos;
    $hash->{mut} = $data[0];
    #we save the region
    push @{$hash->{regions}}, $data[4], $data[8];
    push @{$dataArray[$dataArrayIndex]},$hash;
  }

  #Now we are parsing the values
  #We need to make sure that we can detect the occurence of
  #at least $opts{n} mutations in a stretch of $opts{l} nucleotides.
  my $current = 0;                      # SNP count inside the current window
  my @scoreArray = (0) x $opts{l};      # per-position counts of the window

  #initialize the regionArray // regionIntersectArray
  my @regionArray          = (0) x scalar(@dataArray);
  my @regionIntersectArray = (0) x scalar(@dataArray);

  #We also initialize the file containing the data to be plotted by R
  print "Position\tNumber of disruptive SNP\n";
  open(my $csvFh, '>', "$job.csv")
    or die "Cannot write '$job.csv': $!\n";
  print {$csvFh} "Position distance_RNAplfold_p distance_RNAfold_p count distance_RNAplfold_threshold distance_RNAfold_threshold Disrupted_Region Disrupted_Intersect_Region opening length number \n";

  #We first look at the disrupted Region
  for my $i (0 .. $#dataArray){
    foreach my $SNP (@{ $dataArray[$i] || [] }){
      #if SNP below threshold look at region
      next unless($SNP->{a} < $opts{a} && $SNP->{b} < $opts{b});

      # Intersection of all regions of this SNP: largest start, smallest end.
      my ($interStart, $interEnd);
      foreach my $Region (@{ $SNP->{regions} || [] }){
        # Only use the captures of a successful match; otherwise $1/$2
        # would still hold the values of an earlier match.
        next unless(defined($Region) && $Region=~/(\d+)-(\d+)/);
        my ($start, $end)=($1,$2);
        $interStart = $start if(!defined($interStart) || $start > $interStart);
        $interEnd   = $end   if(!defined($interEnd)   || $end   < $interEnd);
        $regionArray[$_]+=1 for($start .. $end);
      }
      # Without any parsable region there is nothing to intersect (the
      # placeholder records created above carry no regions at all).
      next unless(defined($interStart));
      $regionIntersectArray[$_]+=1 for($interStart .. $interEnd);
    }
  }

  for my $i (0 .. $#dataArray){
    my $count=0;                 # SNPs at this position passing both thresholds
    my ($bestA,$bestB)=(1,1);    # p-value pair with the smallest product
    foreach my $SNP (@{ $dataArray[$i] || [] }){
      if($bestA*$bestB > $SNP->{a}*$SNP->{b}){
        ($bestA,$bestB)=($SNP->{a},$SNP->{b});
      }
      $count++ if($SNP->{a} < $opts{a} && $SNP->{b} < $opts{b});
    }

    # Slide the window: drop the oldest position, add the current one.
    my $out=shift(@scoreArray);
    $current+=$count-$out;
    push @scoreArray, $count;

    #save info into the R data file
    my $opening=(defined($openEnergy->{$i}) ? $openEnergy->{$i} : "NA");
    print {$csvFh} join(" ",
                        $i, $bestA, $bestB, $current,
                        $opts{a}, $opts{b},
                        $regionArray[$i], $regionIntersectArray[$i],
                        $opening, $opts{l}, $opts{n})." \n";

    if($current>=$opts{n}){
      # The count covers the last $opts{l} positions, so the reported
      # stretch starts $opts{l}-1 positions before $i (not $opts{n}).
      my $windowStart=$i-$opts{l}+1;
      $windowStart=0 if($windowStart<0);
      printf("%4d-%1d\t%24d\n",$windowStart,$i,$current);
    }
  }

  # Buffered write errors only surface when the handle is closed.
  close($csvFh) or die "Cannot finish writing '$job.csv': $!\n";
}

#Now we generate the file $job.R, which reads $job.csv and
#produces the plot $job.pdf

open(my $rFh, '>', "$job.R")
  or die "Cannot write '$job.R': $!\n";
print {$rFh} "pdf(\"$job.pdf\", width=12, height=6)";
print {$rFh} "\n";
print {$rFh} "data<-read.table(\"$job.csv\", header=T);";
print {$rFh} "\n";
print {$rFh} "maxScale<-1.2*max(c(max(data\$Disrupted_Intersect_Region)), max(data\$count))\n";
print {$rFh} "plot(data\$count ~ data\$Position,type=\"l\",xlab=\"Position\", ylab=\"Count\", ylim=c(0,maxScale))\n";
print {$rFh} "lines(data\$Disrupted_Intersect_Region ~ data\$Position,col=\"red\")\n";
# The CSV column is called "opening"; use its full name instead of
# depending on R's partial matching of "open".
print {$rFh} "lines(data\$opening ~ data\$Position,col=\"blue\")\n";
print {$rFh} "\n";
print {$rFh} "legend(\"topleft\", legend=c(\"Count of disruptive SNP in the last $opts{l} nts\", \"Count of regions significantly disrupted by SNPs\", \"Opening Energy\"),col=c(\"black\",\"red\",\"blue\"),lty=1)\n";
#print {$rFh} "abline(h=$opts{n},col=\"black\")";
print {$rFh} "\n";
print {$rFh} "dev.off()\n";

# The script must be completely on disk before R is started.
close($rFh) or die "Cannot finish writing '$job.R': $!\n";

# List-form system() bypasses the shell, so a job name containing shell
# metacharacters is passed to R verbatim. A failing R run is reported but
# not fatal: the CSV file and the R script have been produced anyway.
if(system('R', 'CMD', 'BATCH', "$job.R") != 0){
  my $reason = ($? == -1) ? "could not be started: $!"
                          : "exited with status ".($? >> 8);
  warn "R CMD BATCH $job.R $reason; $job.pdf was not generated\n";
}


=head1 NAME

selectSensRegion.pl parses the output of the RNAsnp scanning mode and returns
the regions that are sensitive to SNPs

=head1 SYNOPSIS

    selectSensRegion.pl -a 0.4  -b 0.1 -l 10 -n 3 -o <file_openingEnergy> < data.dat

=head1 DESCRIPTION

This script parses the output of the RNAsnp scanning mode and returns
the regions that are sensitive to SNPs. This is done by looking at which
regions have a high number of disruptive SNPs. RNAsnp further produces
a pdf file containing the p-value profile computed by RNAsnp. Here it
should be said that RNAsnp automatically removes SNPs with a p-value
larger than 0.4 in the RNAplfold step and larger than 0.1 in the
RNAfold step. The script returns a list of positions where there are
at least -n SNPs in the last -l nts passing the thresholds
set by -a and -b.

=head1 Options

=over 12

=item C<-a>

Set threshold on the p-value of the distance measure returned in
the prefiltering step (RNAplfold) of RNAsnp

=item C<-b>

Set threshold on the p-value of the distance measure returned in
the second step (RNAfold) of RNAsnp

=item C<-c>

Set threshold on the p-value of the correlation coefficient returned in
the prefiltering step (RNAfold) of RNAsnp. Note: this option is documented
for completeness, but the script currently does not evaluate it.

=item C<-l>

Set the span of the region of interest. This must be set.

=item C<-n>

Set the number of SNP below the threshold -a and -b in order to
report a region. This must be set.

=item C<-o>

The output file returned by RNAplfold with the options -W 200 -L 200 -u 20 -O.
This must be set.

=item C<-j>

Name for the output files. Default "data"

=item

The input file is read through STDIN

=back

=head1 LICENSE

This is released under the Artistic 
License. See L<perlartistic>.

=head1 AUTHOR

Hakim Tafer <htafer@gmail.com>
Radhakrishnan Sabarinathan <sabari@rth.dk>

=head1 SEE ALSO

L<perlpod>, L<perlpodspec>

=cut
