#!/usr/bin/env perl
#
# reads context output (modified for all flanks)
# ranks flanks (i, j) according to ddG
# writes each entry to a single TSV file and
# adds up ranks in single file
#
# unlist file is (tab-separated) TSV with header
# 
# seq_i seq_j identity
#
# seq_j will be excluded
#
#    Copyright (C) 2015  Nikolai Hecker
#
#   This program is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   This program is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with this program.  If not, see <http://www.gnu.org/licenses/>.
# 
############################################################################
use strict;
use Getopt::Long;
use POSIX;
#################################
# sort entries according to dGG #
#################################
sub compdGG 
{
	my %h_a = %{ $a };
	my %h_b = %{ $b };

	if($h_a{'ddG'} < $h_b{'ddG'})
	{
		return 1;
	}
	elsif($h_a{'ddG'} == $h_b{'ddG'})
	{
		return 0;
	}
	else
	{
		return -1;
	}
}

##############################
# assigns ranks to flanks    #
# input array must be sorted #
# assigns average on tie     #
# ranks start with 1         #
#                            #
# returns reference to array #
# comprising ranks           #
##############################
sub assign_ranks
{
	my $a_ref = $_[0];
	my @a = @{ $a_ref };

	my $r_rank = 1;
	my @ranks = ();

	#get ddG array
	my @a_ddg = ();

	for(my $i = 0; $i < @a; $i++)
	{
		my %h = %{ $a[$i] };
		$a_ddg[$i] = $h{'ddG'};
	}

	for( my $i = 0; $i < @a_ddg; $i++)
	{
		
		#case: equal ranks
		if($i+1 <  @a_ddg && $a_ddg[$i] == $a_ddg[$i+1])
		{
			my $pos = $i;
			my $last_pos = $i;
			my $sum = $r_rank;
		
			$r_rank++;
			$i++;
			
			while( $i < @a_ddg && $a_ddg[$i-1] == $a_ddg[$i])
			{
				$sum += $r_rank;
				$last_pos = $i;
				$r_rank++;
				$i++;
			}
			$i = $last_pos; #i already incremented by for loop
			my $av_rank = $sum/($last_pos-$pos+1);

			for(my $j = $pos; $j <= $last_pos; $j++)
			{
				$ranks[$j] = $av_rank;
			}
		}
		else #simple case
		{
			$ranks[$i] = $r_rank;
			$r_rank++;
		}
	}

	return \@ranks;
}

my $inf = "";
my $out = "";
my $listf = "";
my $unlistf = "";

#parse options
my $usage = "Usage:\n\t$0 [OPTIONS]\n";
$usage .= "\t\t-i|input-file\t\tFILENAME\tinput file: RNAcop output file\n";
$usage .= "\t\t-o|output-title\t\tSTRING\t\toutput title\n";
$usage .= "\t\t-l|list-file\t\tFILENAME\tlist file: entries to be included (optional)\n";
$usage .= "\t\t-u|unlist-file\t\tFILENAME\tunlist file: entries to be excluded (optional)\n";


my $opts = GetOptions (
    'i|input-file=s' => \$inf,
	'o|output-title=s' => \$out,
	'l|list-file=s' => \$listf,
	'u|unlist-file=s' => \$unlistf
    );

if( $inf eq "" || $out eq "")
{
	print $usage;
	print "Too few arguments.\n";
	exit;
}


my $fh;
my $fh_out;

#load accessions to be included
my %h_incl = ();
if($listf ne "")
{
	print "...loading entries to be included.\n";
	open($fh, "<$listf") || die "Can't open '$listf'\n.";

	while(<$fh>)
	{
		my $line = $_;
		chomp($line);

		$h_incl{$line} = 1;
	}
	close($fh);
}

#load accessions to be excluded
my %h_excl = ();
if($unlistf ne "")
{
	print "...loading entries to be excluded.\n";
	open($fh, "<$unlistf") || die "Can't open '$unlistf'\n.";

	my $first = 1;
	while(<$fh>)
	{
		if($first)
		{
			$first = 0;
			next;
		}
		my $line = $_;
		chomp($line);

		my @s = split("\t", $line);
		
		$h_excl{$s[1]} = 1;
	}
	close($fh);
}


#process context output
open($fh, "<$inf") || die "Can't open '$inf'.\n";

my $id = "";
my $loc_s = 0;
my $loc_e = 0;
my $strc_s = 0;
my $strc_e = 0;
my @flanks = (); #array -> hash (i , j, ddG)

my $process = 1;
my $num_seqs = 0;

while(<$fh>)
{
	my $line = $_;
	chomp($line);
    
    #new entry
    if( $line =~ /^>([^:]*)/)
    {
		
        $id = $1;
        $loc_s = $2;
        $loc_e = $3;
				   
		if( $listf ne "" && !defined($h_incl{$id}) )
		{
			$process = 0;
		}
		elsif($unlistf ne "" && defined($h_excl{$id}) )
		{
			$process = 0;
		}
    }
	
	#structure pos
	if( $line =~ /^desired structure located\s+(\d+)\.\.(\d+)$/ )
	{
		$strc_s = $1;
		$strc_e = $2;
	}

	#flank
	if( $line =~ /^i:\s*(\d+)\s+j:\s*(\d+)\s+ddG:\s*(\-?\d+\.?\d*)\s*$/)
	{
		if($process)
		{
			my $i = $1;
			my $j = $2;
			my $ddG = $3;
			
			my $l = $strc_s - $i;
			my $r = $j - $strc_e;
			
			my %th = ( 
				'left' => $l,
				'right' => $r,
				'ddG' => $ddG
				);
			
			push(@flanks, \%th);
		}
	}

	#end of entry
	if( $line =~ /^Best subsequence (\d+)..(\d+), ddG and probability of substructure\s+(\-?\d+\.?\d*)\s+(\-?\d+\.?\d*)/)
	{
		if($process == 1)
		{
			print "...sorting ('$id')\n";

			#sort flanks according to ddG / assign ranks
			my @s_flanks = sort compdGG  @flanks;
			my @ranks = @{ assign_ranks( \@s_flanks ) };

			#output for sequence
			my @lefts = ();
			my @rights = ();
			my @ddGs = ();

			for(my $i=0; $i < @s_flanks; $i++)
			{
				my %h = %{ $s_flanks[$i] };
				$lefts[$i] = $h{'left'};
				$rights[$i] = $h{'right'};
				$ddGs[$i] = $h{'ddG'};
			}

			my $outf = "${out}_${id}_rddG.tsv";
			open($fh_out, ">$outf") || die "Failed to open '$outf' for write access.\n";
			
			print $fh_out "l5prime\tl3prime\tddG\trank\n";
			for(my $i=0; $i < @ranks; $i++)
			{
				print $fh_out "$lefts[$i]\t$rights[$i]\t$ddGs[$i]\t$ranks[$i]\n";
			}
			close($fh_out);
			print "Output written to '$outf'.\n";

			$num_seqs++;
		}

		#reset
		$id = "";
		@flanks = ();
		$loc_s = 0;
		$loc_e = 0;
		$strc_s = 0;
		$strc_e = 0;
		$process = 1;
	}
}
close($fh);
print "Done.\n";
