#!/usr/bin/perl -w
# Copyright (C) 2007 Stefan E Seemann <seemann@genome.ku.dk>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# Version: v1.0
# Created: 2007-09-21
# Modified: 2008-12-21
# NEW: merge thermodynamic probabilities of all input sequences in one matrix (saves memory and time)

use Getopt::Long;
use strict;

#EXTERNAL PROGRAMS
my $RNAfold = $ENV{'RNAfold_bin'};   # package ViennaRNA-1.6.2
my $Pfold = $ENV{'Pfold_bin'};   # Pfold

&usage() if $#ARGV<1;
&usage1() if ! -e "$RNAfold/RNAfold";
&usage1() if ! -e "$Pfold/fasta2col";

#CONSTANTS
my $KT = 0.616320775;
my $EVOCON_BP = 0.9;   # minimal base pair reliability for evolutionary constraints
my $EVOCON_SS = 1;     # minimal single stranded reliablity for evolutionary constraints
my $ALPHA = 0.2;       # weighting factor for single stranded probabilities (alpha <= 0.5)
my $BETA = 1;          # weighting factor for thermodynamic overlap
my $GAP = 0.25;        # maximal allowed percent of gaps in an alignment column; deletion of column if rate is equal or higher

my $INFINITE = 1000;
my $SETRULES = 0;      # usage of rule probabilities of the SCFG
my $SETEVO = 1;	       # usage of evolutionary reliabilities (performed by Pfold)
my $SETTHERMO = 1;     # usage of thermodynamic probabilities (performed by RNAfold)
my $WAR = 0;	       # fasta format output
my $PPFILE = 0;	       # name of pp-file
my $SETTREE = 0;       # given phylogenetic tree in Newick tree format
my ($SETSTRUCT,$fasta);

GetOptions('fasta=s' => \$fasta,
	   'setrules' => \$SETRULES,
	   'setevo' => \$SETEVO,
	   'setthermo' => \$SETTHERMO,
	   'setstruct=s' => \$SETSTRUCT,
	   'settree=s' => \$SETTREE,
	   'setevocon_bp=s' => \$EVOCON_BP,
	   'setevocon_ss=s' => \$EVOCON_SS,
	   'setalpha=s' => \$ALPHA,
	   'setbeta=s' => \$BETA,
	   'setgap=s' => \$GAP,
           'war' => \$WAR,
	   'ppfile=s' => \$PPFILE);

&usage2() if ! -e "$fasta";

# create unique ID for temporary files (Attention: RNAfold do not create a dotplot file if _ or - is contained in the fasta header)
my $pid = "$$"; 			# current PID
$pid = "a" . $pid; 			# attach a letter to make it a string
my @l = split(//,$pid);	                # split and get the last 4 numbers (or the last 3 with the 'a')
$l[$#l-3] = "0" if $l[$#l-3] eq "a";
my $ID = $l[$#l-3] . $l[$#l-2] . $l[$#l-1] . $l[$#l]; 	# 4digit ID of this process

my %bp = (
	  'GU' => 1, 'UG' => 1,
	  'AU' => 1, 'UA' => 1,
	  'GC' => 1, 'CG' => 1
	  );

my %scfg = ($SETRULES) ? (
			  'S' => { 'LS' => 0.868534, 'L' => 0.131466 },
			  'F' => { 'dFd' => 0.787640, 'LS' => 0.212360 },
			  'L' => { 's' => 0.894603, 'dFd' => 0.105397 }
			  ) :
                          (
			  'S' => { 'LS' => 0, 'L' => 0 },
			  'F' => { 'dFd' => 0, 'LS' => 0 },
			  'L' => { 's' => 0, 'dFd' => 0 }
			  );

###############################
# MAIN          #########
###################

my ($align_ref, $org_ref) = get_alignment($fasta);
#print "align_ref:\n";map { map { print "$_" } @{$_}; print "\n"} @{$align_ref};

my ($align_new_ref, $gap_col_ref) = delete_gap_columns($align_ref);
#print "align_new_ref:\n";map { map { print "$_" } @{$_}; print "\n"} @{$align_new_ref};

my ($paired_tree_ref,$single_tree_ref,$struct) = get_prob_tree($align_new_ref,$org_ref,$SETTREE,$SETEVO);
#if( $SETEVO ) {
#    print "paired_tree_ref:\n";
#    map { map {print "$_\t"} @{$_}; print "\n" } @{$paired_tree_ref};print "\n";
#    print "single_tree_ref:\n";
#    map {print "$_\t"} @{$single_tree_ref};print "\n";
#}
my ($evocon_bp_left_ref,$evocon_bp_right_ref,$evocon_ss_ref) = get_constr_tree($paired_tree_ref,$single_tree_ref,$struct);
my $evocon = "." x ($#{$align_new_ref->[0]}+1);
$evocon = get_constraint_string($evocon_bp_left_ref,$evocon_bp_right_ref,$evocon_ss_ref,$#{$align_new_ref->[0]}+1) if $SETEVO;

my $paired_seq_ref = get_prob_paired_seq($align_new_ref,$evocon,$SETTHERMO); 
my $single_seq_ref = get_prob_unpaired_seq($align_new_ref,$paired_seq_ref,$SETTHERMO);
#if( $SETTHERMO ) {
#    print "\npaired_seq_ref: open bond (upper triangle)\n";
#    map { map { map { print "$_\t"} @{$_}; print "\n" } @{$_}; print "\n" } @{$paired_seq_ref};print "\n";
#    print "single_seq_ref:\n";
#    map{ map { print "$_\t" } @{$_}; print "\n" } @{$single_seq_ref};print "\n";
#}

my $single_pet_ref = get_pet_single($single_tree_ref,$single_seq_ref);
my $paired_pet_ref = get_pet_paired($paired_tree_ref,$paired_seq_ref);

$evocon = mod_setstruct($SETSTRUCT, $gap_col_ref) if $SETSTRUCT;
my $consstr_ref = test_consstr($align_new_ref,$evocon);
#map { print $_." " } @{$consstr_ref};print "\n";

my $T_ref = cyk($align_new_ref,\%scfg,$paired_pet_ref,$single_pet_ref,$consstr_ref);
#print "T:\n";
#my $i=-1;
#map { map { map { print "$_->[0]\t" } @{$_}; print "\n" } @{$_}; $i++; print "this was i=$i\n" } @{$T_ref};print "\n";

my $seqlg=$#{$align_new_ref->[0]}+1;
my $ss;
if( $T_ref->[0][$seqlg-1][0][0]!=$INFINITE ) {
    $ss = backtracking($T_ref);
    $ss = include_gap_columns($ss,$gap_col_ref);
}

###############################
# OUTPUT        #########
###################

create_ppfile($paired_pet_ref,$single_pet_ref,$PPFILE) if $PPFILE;
	
if( $WAR ) {
    system("cat $fasta");
    print ">structure\n";
    $ss =~ s/-/\./g;
    print $ss."\n";
}
else {
    if( $SETEVO ) {
	print "Pfold RNA sec.struct.:\t\t".include_gap_columns($struct,$gap_col_ref)."\n";
	print "Constraints:\t\t\t".include_gap_columns($evocon,$gap_col_ref)."\n";
    }    
    print "PETfold RNA sec.struct.:\t$ss\n" if $T_ref->[0][$seqlg-1][0][0]!=$INFINITE;
    print "Score_{model,structure}(tree,alignment) = $T_ref->[0][$seqlg-1][0][0]\n";
    # Reliability = Score/{seqence length without gaps}
    print "Reliability_{model,structure}(tree,alignment) = " . $T_ref->[0][$seqlg-1][0][0]/$seqlg . "\n";
}

###############################
# FUNCTIONS     #########
###################

# to modify the desired structure as constraints and
# to test the accordance of desired structure to the gap columns (gap must be a unpaired position!!!)
sub mod_setstruct
{
    my ($setstr, $gap_col) = @_;

    $setstr =~ s/\./x/g;
    $setstr =~ s/\(/</g;
    $setstr =~ s/\)/>/g;
    my $tmpstr=$setstr;

    foreach my $k ( reverse @{$gap_col} ){
	die("Consensus structure invalid: wrong length!\n") if $k >= length($tmpstr);
	if( substr($tmpstr,$k,1) ne "x" ){
	    die("The desired RNA structure can not be evaluated! Position $k has to be unpaired because an alignment gap.\n");
	}else{
	    $setstr=substr($setstr,0,$k).substr($setstr,$k+1);
        }
    }

    return $setstr;
}

# to test the correctness of the input consensus structure and sign the related bases of all basepairs
sub test_consstr
{
    my ($align_ma,$consstr) = @_;
    my @ss;

    my $bp = -1;
    my @consstr = split "",$consstr;

    die("Consensus structure invalid: wrong length!\n") if $#consstr!=$#{$align_ma->[0]};

    for(my $i=0; $i<$#consstr+1; $i++) {
	if( $consstr[$i] eq '<' ) {
	    push @ss, ++$bp;
	}
	elsif( $consstr[$i] eq '>' ) {
	    push @ss, $bp--;
	}
	elsif( $consstr[$i] eq 'x' ) {
	    push @ss, -1;
	}
	else{
	    push @ss, $INFINITE;
	}
    }

    die("Consensus structure invalid: number of left and right parenthesis unequal!\n") if $bp!=-1;

    return \@ss;
}

# to extract paired and single probabilities of tree
sub get_prob_tree
{
	my ($align_ma,$org_ve,$SETTREE,$SETEVO) = @_;
	my ($l,$i,$j,$line,@paired,@single);
	my $struct = "";

	if($SETEVO) {
	    # write fasta file
	    my ($fafile, $seqname_ve) = get_fasta_file($align_ma,$org_ve);
	    # to run pfold to get the phylogenetical probabilities of the input alignment 
	    my $colfile = "t$ID.col";
	    # to change the format
	    system "$Pfold/fasta2col $fafile | sed 's/arbitrary/RNA/g' > $colfile";

	    # to calculate pairwise and single probabilities depending on the tree using Felsensteins dynamic programming for phylogenetic trees
	    my $ppfile = "t$ID.pp";
	    my $pfoldout = "t$ID.out";
	    my $mytree = "t2$ID.col";
	    my $mytree_ml = "t3$ID.col";
	    # phylogenetical tree is given
	    if($SETTREE) {
		system "echo \"$SETTREE:0\" | $Pfold/newick2col > $mytree";
		# branch lengths are not given and are estimated by maximum likelihood
		if(index($SETTREE,":")==-1) {
			system "cat $colfile | $Pfold/nohead >> $mytree";
			system "$Pfold/mltree $Pfold/scfg.rate $mytree 2> /dev/null > $mytree_ml";
			system "$Pfold/scfg --treeinfile --ppfile $ppfile $Pfold/article.grm $mytree_ml 2> /dev/null > $pfoldout";
		}
		# branch lengths are given
		else {
			#system "$Pfold/drawphyl --mult=5 $mytree > tree2.ps";
			system "$Pfold/scfg --treefile --ppfile $mytree $ppfile $Pfold/article.grm $colfile > $pfoldout";
		}
	    }
	    # phylogenetical tree is not given and is estimated by neighbour joining approach
	    else {
	    	system "$Pfold/findphyl $Pfold/scfg.rate $colfile | $Pfold/mltree $Pfold/scfg.rate 2> /dev/null | $Pfold/scfg --treeinfile --ppfile $ppfile $Pfold/article.grm 2> /dev/null > $pfoldout";
	    }

	    open IN, $ppfile || die("Can not open the file!\n");
	    $l=<IN>;
	    # write paired probabilities in a 2D-array
	    for($i=0; $i<$l; $i++) {
		$line = <IN>;
		my @row = split " ", $line;
		#map { $_=log($_)/log(2) if $_!=0; $_=$INFINITE if $_==0 } @row;
		push (@paired, \@row);
	    }
	    <IN>;
	    # write single probabilities in a 1D-array
	    $line = <IN>;
	    @single = split " ", $line;
	    close IN;

	    # write Pfold predicted RNA secondary structure in an string
	    $struct = get_pfold_sec_struct($pfoldout);

	    system "rm -f $colfile $ppfile $fafile $pfoldout $mytree $mytree_ml";
	}
	else {
	    $l=$#{$align_ma->[0]}+1;
	    
	    for($i=0; $i<$l; $i++) {
		my @row;
		for($j=0; $j<$l; $j++) {
		    push @row, 0;
		}
		push (@paired, \@row);
		push (@single, 0);
	    }
	}

	return \@paired,\@single,$struct;	
}

# to extract the Pfold predicted RNA secondary structure from the Pfold output file
sub get_pfold_sec_struct
{
    my ($pfoldout) = @_;
    my (@l);
    my $status = 0;
    my $string = "";

    open IN, $pfoldout || die("Can not open the file!\n");
    while(<IN>) {
	$status = 1 if /RNA/;
	if( /^; \*/ && $status ) {
	    return $string;
        }
	next if /^;/;
        next unless $status;
	
	@l = split " ", $_;
        if( $l[4] eq '.' ) {
	    $string .= ".";
        }
	else{
	    if( $l[3]<$l[4] ) {
		$string .= "(";
	    }
	    else{
		$string .= ")";
	    }
	}
    }
    close IN;

    return $string;
}

# to get the indizes of left and right bases of basepairs and singlestranded bases which are evolutionary highly conserved
sub get_constr_tree
{
    my ($paired_tree_ma,$single_tree_ve,$struct) = @_;
    my (@struct,$i,$l);
    my @stack = (0);

    my @bp_left = ();
    my @bp_right = ();
    my @ss = ();

    if( $struct ne "" ) {
	@struct = split "",$struct;

	for($i=0; $i<$#struct+1; $i++){
	    if( $struct[$i] eq "(" ) {
		unshift @stack, $i;
	    }
	    elsif( $struct[$i] eq ")" ) {
		$l = shift @stack;
		if( $paired_tree_ma->[$l][$i] > $EVOCON_BP ) {
		    unshift @bp_left, $l;
		    unshift @bp_right, $i;
		}
	    }
	    else{
		unshift @ss, $i if $single_tree_ve->[$i] > $EVOCON_SS;
	    }
	}
    }

    return \@bp_left,\@bp_right,\@ss;
}

# to extract the alignment in a matrix (lines: sequences, rows: baseposition) from fasta-file
sub get_alignment
{
	my ($fasta) = @_;
	my (@align,@org);
	my $seq = "";

	open IN, $fasta || die("Can not open the file!\n");
	$_ = <IN>;
	chomp($_);
	s/^>//;
	push @org, $_;  
	while( <IN> ) {
		chomp($_);
		if( /^>/ ) {
			s/^>//;
			push @org, $_;
			my @row = split "", $seq;
                	push (@align, \@row);
			$seq = "";
			next;
		}
		$_=~ tr/\.gcautT/-GCAUUU/;
		$seq .= $_;
	}
	close IN;
	my @row = split "", $seq;
        push (@align, \@row);

	return \@align, \@org;	
}

# to delete columns in an alignment with more or equal $GAP% of gaps (25% in Pfold)
sub delete_gap_columns
{
	my ($align_ma) = @_;
	my ($i,$j,$nr,@d,@align_ma_new,$start,$k);

	my $seqnr=$#{$align_ma}+1;
        my $seqlg=$#{$align_ma->[0]}+1;

	# to find columns where gap rate is higher and equal as $GAP%
	for( $j=0; $j<$seqlg; $j++ ) {
		$nr = 0;
		for( $i=0; $i<$seqnr; $i++ ) {
			$nr++ if $align_ma->[$i][$j] eq "-";
		}
		push @d, $j if $nr/$seqnr>=$GAP;
	}

	# to delete these columns
	for( $i=0; $i<$seqnr; $i++ ) {
		my $subseq = "";
		$start = 0;
		for( $j=0; $j<$#d+1; $j++ ) {
			for( $k=$start; $k<$d[$j]; $k++ ) {
				$subseq .= $align_ma->[$i][$k];
			}
			$start = $d[$j]+1;
		}
		for( $k=$start; $k<$seqlg; $k++ ) {
                	$subseq .= $align_ma->[$i][$k];
                }
		my @row = split "", $subseq;
		push (@align_ma_new, \@row);
	}

	return \@align_ma_new,\@d;
}

# to add gaps in a string
sub include_gap_columns {
	my ($ss,$gap_col_ve) = @_;

	my @gaps_sorted = sort {$a<=>$b} @{$gap_col_ve};
	my @ss = split "", $ss;
	my $ss_new = "";
	my $offset = 0;

	for( my $i=0; $i<=$#ss+1; $i++ ) {
		foreach( @gaps_sorted ) {
			if( $_==$i+$offset ) {
				$ss_new .= "-";
				$offset++;
			}
		}
		$ss_new .= $ss[$i] if defined $ss[$i];
	}

	return $ss_new;
}

# to write a fasta file
sub get_fasta_file {
	my ($align_ma,$org_ve,$evocon,$gapid2origid_ma) = @_;
	my ($i,$n,$evocon_gap_free,@name);

	my $seqnr=$#{$align_ma}+1;

	my $fname = "t$ID.fa";
	open(file_local, ">$fname");
	for( $i=0; $i<$seqnr; $i++ ) {
		$n = ($org_ve) ? $org_ve->[$i] : $ID.$i;
		push @name, $n;
		print file_local ">$n\n";
		map{ print file_local "$_" } @{$align_ma->[$i]};
		print file_local "\n";
		# include constraint string
		if( defined $evocon ) {
		    # adapt constraint string to the gap free sequence
		    $evocon_gap_free = "";
		    map { $evocon_gap_free .= substr($evocon,$_,1) } @{$gapid2origid_ma->[$i]};
		    print file_local $evocon_gap_free . "\n";
		}
	}
	close(file_local);

	return $fname,\@name;
}

# to extract the alignment without gaps in a matrix and keep the index relation between original and gap free sequences
# return alignment matrix and 
# matrix reference which columns are sequences, rows are indizes of gap free sequences and entries are related indizes of original sequences
sub get_align_without_gaps
{
	my ($align_ma) = @_;
	my ($i,$j,$k,@align_gap_free,$fname,@d);
	
	my $seqnr=$#{$align_ma}+1;
        my $seqlg=$#{$align_ma->[0]}+1;	

	for( $i=0; $i<$seqnr; $i++ ) {
		my $subseq = "";
		$k = 0;
		for( $j=0; $j<$seqlg; $j++ ) {
			if( $align_ma->[$i][$j] ne "-" ) {
				$d[$i][$k++] = $j;
				$subseq .= $align_ma->[$i][$j];				
			}
		}
		my @row = split "", $subseq;
                push (@align_gap_free, \@row);
	}

	return \@align_gap_free,\@d;
}

# to extract basepair probabilities
# calculate energy distributions without gaps, use for gaps the average of probabilities in column
sub get_prob_paired_seq
{
	my ($align_ma,$evocon,$SETTHERMO) = @_;
	my (@prob,@l,$i,$j,$m,$k,$b1,$b2,@gappos,$sum,$seqgf);

	my $seqnr=$#{$align_ma}+1;
	my $seqlg=$#{$align_ma->[0]}+1;

	# to initialise 1 matrix with zero items
	# upper triangle with pairing probabilities of open bonds (RNAfold -p)
	for( my $i=0; $i<$seqlg; $i++) {
		for( my $j=0; $j<$seqlg; $j++) {
		    $prob[$i][$j] = 0;			
		}
	}

	return \@prob unless $SETTHERMO;

	# to calculate basepair probabilities: Pr[(A^i_u,A^{i+j-1}_u)|s_u]
	# write fasta file without gaps
	my ($align_gap_free_ma, $gapid2origid_ma) = get_align_without_gaps($align_ma);
	my ($fname, $seqname_ve) = get_fasta_file($align_gap_free_ma,0,$evocon,$gapid2origid_ma);

	# call RNAfold -p
	system "$RNAfold/RNAfold -p -C < $fname > /dev/null";
	my $cmd = "cat ";
	map {$cmd.=$_."_dp.ps "} @$seqname_ve;
	system $cmd . "> t$ID.dotp.ps";

	# fill matrix with probabilities of folding energies predicted by RNAfold
	open(file_local, "t$ID.dotp.ps");
	my $start = 0;
	my $seq = -1;
	while( <file_local> ) {
	    if( /%data starts here/ ) {
		$start = 1;
		$seq++;
		next;
	    }
	    $start = 0 if /showpage/;
	    next unless $start;
	    
	    @l = split " ";
	    # fetch small bug in RNAfold
	    $seqgf=$#{$align_gap_free_ma->[$seq]}+1;
	    next if $l[0]>$seqgf || $l[0]<1 || $l[1]>$seqgf || $l[1]<1 || $l[2]==0;
	    
	    # RNAfold returns the square roots of the base pair probabilities sqrt{P(i,j)}
	    $prob[ $gapid2origid_ma->[$seq][$l[0]-1] ][ $gapid2origid_ma->[$seq][$l[1]-1] ] += $l[2]*$l[2] if $l[3]=~/ubox/;
	}
	close file_local;
	$cmd = "rm -f ";
	map {$cmd.=$_."_dp.ps ".$_."_ss.ps "} @$seqname_ve;
	system $cmd;
	system "rm -f t$ID.dotp.ps $fname";

	# helping matrix counts for each column of alignment the number of gaps 
	for( $i=0; $i<$seqlg; $i++ ) {
		$gappos[$i] = 0;
		for( $m=0; $m<$seqnr; $m++ ) {
			$gappos[$i]++ if $align_ma->[$m][$i] eq "-"; 
		}
	}	

	# devide the sum of probabilities in matrix by the amount of sequences (without gaps)
	for( $i=0; $i<$seqlg; $i++) {
		for( $j=$i+1; $j<$seqlg; $j++) {
			$prob[$i][$j] /= $seqnr-$gappos[$i];
		}
	}	

	return \@prob;
}

# to compute unpaired probabilities as
# Prob_unpaired(i) = 1 - SUM_j{Prob_paired(i,j)}
sub get_prob_unpaired_seq
{
	my ($align_ma,$paired_seq_vema,$SETTHERMO) = @_;
	my ($i,$j,$k,@prob,$sum);

	my $seqlg=$#{$align_ma->[0]}+1;

	# initialise output matrix ( row: sequence, column: base position )
	for( $j=0; $j<$seqlg; $j++) {
	    $prob[$j] = 0;
	}

	return \@prob unless $SETTHERMO;

	for( $i=0; $i<$seqlg; $i++) {
		$sum = 0;
		for( $j=$i+1; $j<$seqlg; $j++) {
	    		$sum += $paired_seq_vema->[$i][$j];
		}
		for( $j=0; $j<$i; $j++ ) {
	    		$sum += $paired_seq_vema->[$j][$i];
		}
		$prob[$i] = 1 - $sum;
    	}

	return \@prob;
}

# to calculate the optimal secondary structure from alignment, tree and model (SCFG) by CYK-algorithm
# using phylogenetical and energy folding information
sub cyk
{
	my ($align_ma,$scfg_haha,$paired_pet_ma,$single_pet_ve,$consstr_ve) = @_;
	my (@T,$t,$i,$j,$prod,$s,$k,$pr,$bp);

	my $seqnr=$#{$align_ma}+1;
	my $seqlg=$#{$align_ma->[0]}+1;

	# to initialise cube T which holds arrays 
	# column($i=1,..,$seqlg+1-$j): base position;
	# row($j=1,...,$seqlg): length of subsequence;
	# depth($t): non terminals of SCFG: [0]=>'S', [1]=>'F', [2]=>'L'
	# array($a): [0]=>probability,
	#	     [1]=>reference to 1. child rule or infinite for rule L->s,
	#	     [2]=>reference to 2. child rule or infinite
	for( $i=0; $i<$seqlg; $i++ ) {
		for( $j=0; $j<$seqlg; $j++) {
			for( $t=0; $t<3; $t++) {
				$T[$i][$j][$t][0] = $INFINITE;
				$T[$i][$j][$t][1] = $INFINITE;
				$T[$i][$j][$t][2] = $INFINITE;
			}
		}
	}

	# to write first row of table T with probabilities of unpaired bases in the alignment
	for( $i=0; $i<$seqlg; $i++ ) {

	    #next if( $consstr_ve && $consstr_ve->[$i]!=-1 );
	    next if( $consstr_ve->[$i]!=$INFINITE && $consstr_ve->[$i]!=-1 );

	    # T[$i,0][L] := Pr_{rule}[L->s] x Pr_{single}[\vec{A}^i|T] x prod_{u=0}^{n-1} Pr_{unpaired}[A^i_u|s_u]
	    $T[$i][0][2][0] = $ALPHA * add($scfg_haha->{'L'}{'s'},2*$single_pet_ve->[$i]);

	    # T[$i,0][S] := Pr_{rule}[S->L] x T[$i,0][L]
	    $T[$i][0][0][0] = add($scfg_haha->{'S'}{'L'},$T[$i][0][2][0]);
	    $T[$i][0][0][1] = \$T[$i][0][2];
	}

	# to fill table T
	for( $j=1; $j<$seqlg; $j++ ) {
	    for( $i=0; $i<$seqlg-$j; $i++ ) {

		# rules S->LS & F->LS (standard CYK-algorithm)
		for( $k=1; $k<$j+1; $k++ ) {

		    next if( $T[$i][$k-1][2][0]==$INFINITE || $T[$i+$k][$j-$k][0][0]==$INFINITE );

		    # T[$i,$j][S] := max{ T[$i,$j][S], Pr_{rule}[S->LS] x T[$i,$k-1][L] x T[$i+$k,$j-$k][S] }
		    $pr = add(add($scfg_haha->{'S'}{'LS'},$T[$i][$k-1][2][0]),$T[$i+$k][$j-$k][0][0]);
		    if( max($pr,$T[$i][$j][0][0]) ) {
		    	$T[$i][$j][0][0] = $pr;
			$T[$i][$j][0][1] = \$T[$i][$k-1][2];
			$T[$i][$j][0][2] = \$T[$i+$k][$j-$k][0];
		    }		    
		    # T[$i,$j][F] := max{ T[$i,$j][F], Pr_{rule}[F->LS] x T[$i,$k-1][L] x T[$i+$k,$j-$k][S] }
		    $pr = add(add($scfg_haha->{'F'}{'LS'},$T[$i][$k-1][2][0]),$T[$i+$k][$j-$k][0][0]);
                    if( max($pr,$T[$i][$j][1][0]) ) {
		    	$T[$i][$j][1][0] = $pr;
			$T[$i][$j][1][1] = \$T[$i][$k-1][2];
			$T[$i][$j][1][2] = \$T[$i+$k][$j-$k][0];
		    }
		}

		# rule F->dFd (inner bond)
		# loop consists at least 3 unpaired bases
		if( $j>3 ) {

		    if( ($consstr_ve->[$i]==$INFINITE && $consstr_ve->[$i+$j]==$INFINITE) || ($T[$i+1][$j-2][1][0]!=$INFINITE && $consstr_ve->[$i]==$consstr_ve->[$i+$j] && $consstr_ve->[$i]!=-1) ) {
		    
			# T[$i,$j][F] := max{ T[$i,$j][F], Pr_{rule}[F->dFd] x $T[$i+1][$j-2][F] x Pr_{paired}[\vec{A}^i\vec{A}^{$i+$j}|T] x 
			# prod_{u=0}^{n-1} { Pr[(A^$i_u,A^{$i+$j}_u)|(A^{$i-1},$i+$j+1),s_u] if paired, 
			#                    Pr_{unpaired}[A^i_u|s_u] x Pr_{unpaired}[A^{$i+$j}_u|s_u] if unpaired } }
			$pr = add(add($scfg_haha->{'F'}{'dFd'},$T[$i+1][$j-2][1][0]),2*$paired_pet_ma->[$i][$i+$j]);
			if( max($pr,$T[$i][$j][1][0]) ) {
			    $T[$i][$j][1][0] = $pr;
			    $T[$i][$j][1][1] = \$T[$i+1][$j-2][1];
			    $T[$i][$j][1][2] = $INFINITE;
			}
		    }
		}

		# rule L->dFd (open bond)
		# loop consists at least 3 unpaired bases
		if( $j>3 ) {

		    if( ($consstr_ve->[$i]==$INFINITE && $consstr_ve->[$i+$j]==$INFINITE) || ($T[$i+1][$j-2][1][0]!=$INFINITE && $consstr_ve->[$i]==$consstr_ve->[$i+$j] && $consstr_ve->[$i]!=-1) ) {
			
			# T[$i,$j][L] := max{ T[$i,$j][L], Pr_{rule}[L->dFd] x $T[$i+1][$j-2][F] x Pr_{paired}[\vec{A}^i\vec{A}^{$i+$j}|T] x 
			# prod_{u=0}^{n-1} { Pr[(A^$i_u,A^{$i+$j}_u)|s_u] if paired, 
			#                    Pr_{unpaired}[A^i_u|s_u] x Pr_{unpaired}[A^{$i+$j}_u|s_u] if unpaired } }
			$pr = add(add($scfg_haha->{'L'}{'dFd'},$T[$i+1][$j-2][1][0]),2*$paired_pet_ma->[$i][$i+$j]);
			if( max($pr,$T[$i][$j][2][0]) ) {
			    $T[$i][$j][2][0] = $pr;
			    $T[$i][$j][2][1] = \$T[$i+1][$j-2][1];
			    $T[$i][$j][2][2] = $INFINITE;
			}
		    }
		}
		
		# rule S->L
		# T[$i,$j][S] := max{ T[$i,$j][S], Pr_{rule}[S->L] x T[$i,$j][L] }
		if( $T[$i][$j][2][0]!=$INFINITE ) {
		    $pr = add($scfg_haha->{'S'}{'L'},$T[$i][$j][2][0]);
		    if( max($pr,$T[$i][$j][0][0]) ) {
			$T[$i][$j][0][0] = $pr;
			$T[$i][$j][0][1] = \$T[$i][$j][2];
			$T[$i][$j][0][2] = $INFINITE;
		    }
		}
	    }
	}

	return \@T;
}

# to return maximum of values considering infinite values
# true if first value is maximum, else false
sub max {
    my ($x,$y) = @_;
    my $max;

    if( $x!=$INFINITE && $y!=$INFINITE ) {
        $max = ($x>=$y) ? 1 : 0;
    }
    elsif( $y==$INFINITE ) {
        $max = 1;
    }
    else {
        $max = 0;
    }

    return $max;
}

# to return summand  of values considering infinite values
sub add {
    my ($x,$y) = @_;

    ( $x!=$INFINITE && $y!=$INFINITE ) ? return $x+$y : return $INFINITE;
}

# backtracking through the CYK-table from T[0,$seqlength-1][S] to T[0..$seqlength-1,0][L]
# to get the consensus secondary structure of the alignment
sub backtracking {
    my ($T_ref) = @_;
    	
    my $ss = 'S';
    my @rid = (0); 
    my @depth = (0);
    my $depth;

    # to get CYK-table entries of childs of rule S in T[0][n-1] 
    my @c1 = ( $T_ref->[0][$#{$T_ref->[0]}][0][1] );
    my @c2 = ( $T_ref->[0][$#{$T_ref->[0]}][0][2] );   

    while(1) {
	# leave the loop if all non-terminals are replaced by terminals
	last unless $ss=~/[SLF]/;

	# actual Non-terminal is S
	if( $rid[$#rid]==0 ) {
		# rule S->LS
		if( $c2[$#c2] != $INFINITE ) {
			$ss =~ s/S/LS/;

			# S rule
			push @rid, 0;
			push @c1, ${$c2[$#c2]}->[1];
			push @c2, ${$c2[$#c2]}->[2];

			# L rule
			push @rid, 2;
			push @c2, ${$c1[$#c1-1]}->[2];
			push @c1, ${$c1[$#c1-1]}->[1];
			$depth[$#depth]++;
			push @depth, 1;
			next;
		}
		# rule S->L
		else {
			$ss =~ s/S/L/;

			push @rid, 2;
			push @c2, ${$c1[$#c1]}->[2];
			push @c1, ${$c1[$#c1]}->[1];
			$depth[$#depth]++;
			next;		
		}
	}
	# actual Non-terminal is F
	elsif( $rid[$#rid]==1 ) {
		# rule F->LS
		if( $c2[$#c2] != $INFINITE ) {
			$ss =~ s/F/LS/;
	
			# S rule
			push @rid, 0;
			push @c1, ${$c2[$#c2]}->[1];
			push @c2, ${$c2[$#c2]}->[2];

			# L rule
			push @rid, 2;
			push @c2, ${$c1[$#c1-1]}->[2];
			push @c1, ${$c1[$#c1-1]}->[1];
			$depth[$#depth]++;
			push @depth, 1;
			next;
		}
		# rule F->dFd
		else {
			$ss =~ s/F/\(F\)/;

			push @rid, 1;
			push @c2, ${$c1[$#c1]}->[2];
			push @c1, ${$c1[$#c1]}->[1];
			$depth[$#depth]++;
			next;		
		}
    	}
	# actual Non-terminal is L
	else {
		# rule L->dFd
		if( $c1[$#c1] != $INFINITE ) {
			$ss =~ s/L/\(F\)/;

			push @rid, 1;
			push @c2, ${$c1[$#c1]}->[2];
			push @c1, ${$c1[$#c1]}->[1];
			$depth[$#depth]++;
			next;		
		}
		# rule L->s
		else {
			$ss =~ s/L/\./;
			$depth = pop @depth;
			for(my $i=0; $i<$depth; $i++) {
				pop @c1;
				pop @c2;
				pop @rid;
			}
			next;
		}
    	}
    }

    return $ss;
}


# returns a constraint string usable with 'RNAfold -C'
sub get_constraint_string
{
    my ($bp_left_ve,$bp_right_ve,$ss_ve,$length) = @_;
    my @string;
    my $j=0;
    my $string = "";

    for( my $i=0; $i<$length; $i++ ) {
	push @string,".";
    }

    map {$string[$_] = "<"} @$bp_left_ve;
    map {$string[$_] = ">"} @$bp_right_ve;
    map {$string[$_] = "x"} @$ss_ve;

    map {$string .= $_} @string;
    
    return $string;
}

# calculates single-stranded probabilities of the PETfold model
sub get_pet_single
{
    my ($single_tree_ve, $single_seq_ma) = @_;
    my (@P, $prod);
    my $seqlg = $#{$single_tree_ve}+1;

    for(my $i=0; $i<$seqlg; $i++) {
    	$P[$i] = add($single_tree_ve->[$i], $BETA * $$single_seq_ma[$i]) / 2;
    }

    return \@P;
}

# calculates base-paired probabilities of the PETfold model
sub get_pet_paired
{
    my ($paired_tree_ma, $paired_seq_vema) = @_;
    my (@P, $prod, $i, $j);
    my $seqlg = $#{$paired_tree_ma}+1;

    for( $i=0; $i<$seqlg; $i++ ) {
	for( $j=0; $j<$seqlg; $j++) {
		$P[$i][$j] = $INFINITE;
	}
    } 	

    for( $j=1; $j<$seqlg; $j++ ) {
    	for( $i=0; $i<$seqlg-$j; $i++ ) {
		$P[$i][$i+$j] = add($paired_tree_ma->[$i][$i+$j], $BETA * $paired_seq_vema->[$i][$i+$j]) / 2;
 	}
    }	

    return \@P;
}

# print the probabilities in a file that can be printed as dotplot using 'drawplot' from Bjarne Knudsen
sub create_ppfile{
    my ($paired_ma, $single_ve, $ppfile) = @_;
    my $seqlg = $#{$single_ve}+1;

    open(FILE, ">$ppfile") || die("Can not open the file!\n");
    print FILE $seqlg . "\n";
    for(my $i=0; $i<$seqlg; $i++){
    	for(my $j=0; $j<$seqlg; $j++){
		if( $paired_ma->[$i][$j] == $INFINITE ) {
			if( $paired_ma->[$j][$i] == $INFINITE ) {
				print FILE "0\t";
			}
			else {
				print FILE $paired_ma->[$j][$i] . "\t";
			}
		}
		else {
			print FILE $paired_ma->[$i][$j] . "\t";
		}
	}
	print FILE "\n";
    }
    print FILE "\n";
    map { print FILE "$_\t" } @{$single_ve};    
    close(FILE);

    return 1;
}

sub numerically { $a <=> $b }

sub usage
{
   die("PETfold v1.0\n" .
       "============\n" .
       "by Stefan E Seemann (seemann\@genome.ku.dk)\n" .
       "\n" .	
       "Usage: PETfold.pl --fasta <file> [ options ] [ parameter settings ]\n" .
       "\n" .
       "   --fasta <file>               ... alignment in fasta format\n" .
       "Options:\n" .
       "   --setstruct <structure>      ... calculates score for given structure in dot-bracket notation\n" .
       "   --settree <tree>		... calculates score for given tree in Newick tree format\n" .
       "   --war			... fasta format output\n" .
       "   --ppfile <file>		... writes PET probabilities in pp-file\n" .
       "                                    which can be drawn (as ps) using 'pfold/drawdot'\n" .
        "Parameter settings:\n" .    
       "   --setevocon_bp <reliability> ... rel.threshold for conserved basepairs (default: 0.9)\n" .
       "   --setevocon_ss <reliability> ... rel.threshold for conserved single stranded pos. (default: 1)\n" .
       "   --setalpha <nr>              ... weighting factor for single stranded probs (default: 0.2)\n" .
       "   --setbeta <nr>               ... weighting factor for thermodynamic overlap (default: 1)\n" .
       "   --setgap <nr>                ... max. percent of gaps in alignment column (default:0.25)\n\n");  
}

sub usage1
{
   die("Have you forgotten to set the right environmental variables for RNAfold as well as Pfold?\n" .
       "\n" .
       "In a bash the commands would be:\n" .
       "\$ export RNAfold_bin='<path_to_RNAfold>'\n" .
       "\$ export Pfold_bin='<path_to_Pfold>'\n\n");
}

sub usage2
{
   die("Input FASTA-file does not exist!\n\n");
}

