#!/usr/bin/perl

# script to detect boundaries of RNA secondary structure from base pair probability matrix.

use strict;
use Getopt::Long;
use Pod::Usage;
no warnings;

# input parameters default values
my $minLen=10; # minimum length of the segment
my $flanking=10; # 5' and 3' flanking sequence so that always information about separation is available
my $dotfile; # input dot plot matrix
my $relibmat; # input petfold relibability matrix
my $pnull=0.0005; # minimal probability
my $verbose;
my $help = 0;

use constant INFINITE => 1e-10;

GetOptions ("flanking=i" => \$flanking,    # numeric
            "minLen=i" => \$minLen, # numeric
            "dotfile=s" => \$dotfile, # string
            "relibmat=s" => \$relibmat, # string
    	    "pnull=f" => \$pnull, # float
            "verbose"  => \$verbose,
            "help|?"  => \$help
        ) or pod2usage(2);

pod2usage(1) if $help;
pod2usage(1) if (!defined $dotfile && !defined $relibmat);

# extract the base pair probabilities of ensemble structure from .ps file
my $seq; # extract sequence from the dot.ps fil
my $seqflag=0;
my @bpp; # array to save the base pair probabilities
my $seqLen; # sequence length


# read bp from the dot file or relibability matrix
# initialize matrix  
my @dt_matrix; # to save base pair probabilities
my @dt_matrix_pp; # to save base pair probabilities

if (defined $dotfile) # if dotmatrix is given as input
{
 open(IN, $dotfile) or die ("Error in opening the input $dotfile file");
 while(<IN>)
 {
  # read the input sequence
  if ($seqflag==1 && $_=~/def/){$seqflag+=1;}
  if ($seqflag==1) {$seq.=$_;$seq=~s/\\//g;$seq=~s/\s+//g;}
  if ($_=~/^\/sequence/){$seqflag=1;}

  # read ensemble base pair probabilities
  if ($_=~/ubox$/ && $_!~/sqrt/)
  { push(@bpp, $_); }
 }
 close IN;

 $seqLen=length($seq); 
}

if (defined $relibmat) # if petfold reliability matrix is given as input
{
 open(IN, $relibmat) or die ("Error in opening the input $relibmat file");
 $seqLen=<IN>;
 # read ensemble base pair probabilities
 my $i=1;
 while(<IN>)
 {
  if($_=~/^\s+/){last;}
  my @ss=split(/\s+/, $_);
  # store the base pair reliabilities from the upper matrix 
  for(my $j=$i+1;$j<=scalar(@ss);$j++)
  { 
    if($ss[$j-1]>0)
    {
     my $tmp=$i." ".$j." ".$ss[$j-1];
     push(@bpp, $tmp);
    }
  }
  $i++;
 }
 close IN;
}

# initialize matrix  
for(my $i=0;$i<$seqLen;$i++)
{
  for(my $j=0;$j<$seqLen;$j++)
  {
    $dt_matrix[$i][$j] = INFINITE;
    $dt_matrix_pp[$i][$j] = 0;
  }
}

# assign base pair probabilities scores
# REF: PMID:17432929
foreach my $d (@bpp)
{
  my @s=split(/\s+/, $d);
  my $i=$s[0]-1;
  my $j=$s[1]-1;
  $dt_matrix[$i][$j]=&max(INFINITE, log($s[2]*$s[2]/$pnull)/log(1/$pnull)), if(defined $dotfile);
  $dt_matrix[$i][$j]=&max(INFINITE, log($s[2]/$pnull)/log(1/$pnull)), if(defined $relibmat);  
  $dt_matrix_pp[$i][$j] = $s[2]*$s[2], if(defined $dotfile);
  $dt_matrix_pp[$i][$j] = $s[2], if(defined $relibmat);
  $dt_matrix[$j][$i]=$dt_matrix[$i][$j];
  $dt_matrix_pp[$j][$i]=$dt_matrix_pp[$i][$j];
}

my $D_kl_w=0;
my %all_kl; # dictionary to save only Dkl values
my %all_kl_full; # dictionary to save all output details

# fitness function to find the segment [ k, l ] with the highest mass
# of self-contained base pairings inside of the larger sequence
for(my $k=$flanking;$k<($seqLen-$minLen-$flanking);$k++) 
{
  for(my $l=($k+$minLen-1);$l<$seqLen-$flanking;$l++)
  {
    my $I_kl=0;
    my $O_kl=0;
    for(my $m=$k;$m<=$l;$m++)
    {
      my $I_k=0;
      my $O_k=0;
      for(my $n=0;$n<$seqLen;$n++)
      { 
        # sum of inside base pair probabilities	
    	if ($n>=$k && $n<=$l)
	    { $I_k+=$dt_matrix[$m][$n];}
        # sum of outside base pair probabilities
    	else
    	{ $O_k+=$dt_matrix[$m][$n];}
      }
      $I_kl+=log(&min($I_k,1));
      $O_kl+=log(1-&min($O_k,1-INFINITE));
    }
    my $S_kl=0;
    for(my $m=$k-$flanking;$m<$k;$m++)
    {
      my $S_k=0;
      for(my $n=$l+1;$n<$l+$flanking;$n++)
      {
        # sum of separating base pair probabilities
        $S_k+=$dt_matrix[$m][$n];
      }
      $S_kl+=log(1-&min($S_k,1-INFINITE))
    }

    my $len_kl=$l-$k+1;
    $D_kl_w = $I_kl/$len_kl + $O_kl/$len_kl + $S_kl/$flanking;

    $all_kl_full{"$k-$l"} = sprintf("%i\t%i\t%f", $k,$l,exp($D_kl_w));
    $all_kl{"$k-$l"} = exp($D_kl_w);
  }
}

# save the inside and outside difference for each segment analysed
my $toutfile;
if (defined $dotfile)
{
$toutfile=$dotfile;
$toutfile=~s/dp\.ps/segments.txt/g;
}
if (defined $relibmat)
{
$toutfile=$relibmat;
$toutfile=~s/\.txt/_segments.txt/g;
}

# save the Scores in a table format 
print "start_k\tend_l\tD_kl_w\n";
# sort output based on dkl values (higher to lower)
foreach my $key (sort { $all_kl{$b} <=> $all_kl{$a} } keys %all_kl)
{
  print $all_kl_full{$key},"\n";
}

### functions

# max value function
sub max {
   my ($a, $b) = @_;
   if ($a>=$b) {return $a;}
   else {return $b;}
}
# min value function
sub min {
   my ($a, $b) = @_;
   if ($a<$b) {return $a;}
   else {return $b;}
}

__END__

=head1 NAME

rnabound - script to detect boundaries of RNA secondary structure from base pair probability matrix.

=head1 SYNOPSIS

  rnabound.pl [OPTIONS] --dotfile=<input> > output.txt

=head1 DESCRIPTION

The script reads base pair probability matrix as either dot plot file (.ps) from RNAfold program or
base pair reliability values from PETfold. 

=head1 OPTIONS

=over 4

=item B<--dotfile> <string>

base pair probability matrix (postscript file) from RNAfold

=item B<--relibmat> <string>

base pair reliability matrix (text file) from PETfold

=item B<--minLen> <int>

minimum size of the subsequence [k,l] to be considered for fitness function (default: 10)

=item B<--flanking> <int>

size of the flanking regions to be considered for spaining base pair probabilities for the fitness function (default: 10)

=item B<--pnull> <float>

threshold for base pair probabilities (default: 0.0005)

=back

=head1 EXAMPLE

Produce PostScript dot plot for subsequence from position 10 to 50:
C<rnabound.pl --dotfile examples/tRNA_rnafold_pp_w100.ps>

=cut
