#!/usr/bin/perl -w
# Copyright (C) 2008 Stefan E Seemann <seemann@genome.ku.dk>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# Version: v3.1.2
# Created: 2008-08-05
# Modified: 2010-08-12
# NEW: setstruct options and checkTree; adapted scoring - constraint bases of sequence x are multiplied by constraint structure probability of sequence x (instead of geometric mean of both constraint structure probabilities); extended stems have to have partial structure probability greater than gamma (partprob); introduce SETTREE; delete      SETSTRUCT

use Getopt::Long;
use strict;

#EXTERNAL PROGRAMS
my $RNAfold = $ENV{'RNAfold_bin'};   # package ViennaRNA
my $Pfold = $ENV{'Pfold_bin'};   # Pfold

&usage() if $#ARGV<2;
&usage1() if ! -e "$RNAfold/RNAfold";
&usage1() if ! -e "$Pfold/fasta2col";

#CONSTANTS
my $KT = 0.616320775;
my $EVOCON_BP = 0.9;   # minimal base pair reliability for evolutionary constraints
my $EVOCON_SS = 1;     # minimal single stranded reliablity for evolutionary constraints
my $ALPHA = 0.2;       # weighting factor for single stranded probabilities (alpha <= 0.5)
my $BETA = 1;          # weighting factor for thermodynamic overlap
my $BETADUPLEX = 1;    # weighting factor for thermodynamic overlap in RNA duplex folding - step 2
my $GAP = 0.25;        # maximal allowed percent of gaps in an alignment column; deletion of column if rate is equal or higher
my $PETCON = 0.9;      # threshold for PET-model constraints to be free for RNA dublex: maximal allowed intramolecular base-paired reliability 
my $PARTPROB = 0.1;    # threshold for minimal probability of single partial structure
 
my $INFINITE = 1000;
my $SETRULES = 0;      # usage of rule probabilities of the SCFG
my $SETEVO = 1;	       # usage of evolutionary reliabilities (performed by Pfold)
my $SETTHERMO = 1;     # usage of thermodynamic probabilities (performed by RNAfold)
my $PARTPROB_AND = 0;  # thermodynamic AND evolutionary partial structure probability have to be larger as PARTPROB; by default OR 
my $INTERMOL = 0;      # structure output of intermolecular base pairs
my $WAR = 0;	       # fasta format output
my $PPFILE = 0;	       # name of pp-file
my $MRNANOTCONSTR = 0; # mRNA (second sequence by convention) is constrained by default
my $NOLP = 0;	       # RNA(co)fold option: disallows pairs that can only occur isolated
my $EXTSTEM = 0;       # constraint stems get extended by basepairs if the average reliability of the extended stem is larger than PETCON
my $SETTREE = 0;       # given phylogenetic tree in Newick tree format
my $SETSTRUCT1 = 0;    # given secondary structure of first alignment in dot-bracket notation 
my $SETSTRUCT2 = 0;    # given secondary structure of second alignment in dot-bracket notation 
my @fasta;

GetOptions('fasta=s' => \@fasta,
	   'setrules' => \$SETRULES,
	   'setevo' => \$SETEVO,
	   'setthermo' => \$SETTHERMO,
	   'setevocon_bp=s' => \$EVOCON_BP,
	   'setevocon_ss=s' => \$EVOCON_SS,
	   'setalpha=s' => \$ALPHA,
	   'setbeta=s' => \$BETA,
	   'setbetaduplex=s' => \$BETADUPLEX,
	   'setgap=s' => \$GAP,
	   'setpetcon=s' => \$PETCON,
	   'setpartprob=s' => \$PARTPROB,
	   'partprob_and' => \$PARTPROB_AND,
	   'settree=s' => \$SETTREE,
	   'setstruct1=s' => \$SETSTRUCT1,
   	   'setstruct2=s' => \$SETSTRUCT2,
   	   'intermol' => \$INTERMOL,
           'war' => \$WAR,
	   'ppfile=s' => \$PPFILE,
   	   'mrnanotconstr' => \$MRNANOTCONSTR,
   	   'noLP' => \$NOLP,
   	   'extstem' => \$EXTSTEM);

&usage() if $#fasta!=1;

# check if input files exist
foreach my $i (0, 1) {
	if(! -e $fasta[$i]) {
		die "Oops! A file called $fasta[$i] does not exist.\n";
	}
}

# create unique ID for temporary files
my $ID = &getUniqueID();

$NOLP = ($NOLP) ? "-noLP" : "";

my %bp = (
	  'GU' => 1, 'UG' => 1,
	  'AU' => 1, 'UA' => 1,
	  'GC' => 1, 'CG' => 1
	  );

my %scfg = ($SETRULES) ? (
			  'S' => { 'LS' => 0.868534, 'L' => 0.131466 },
			  'F' => { 'dFd' => 0.787640, 'LS' => 0.212360 },
			  'L' => { 's' => 0.894603, 'dFd' => 0.105397 }
			  ) :
                          (
			  'S' => { 'LS' => 0, 'L' => 0 },
			  'F' => { 'dFd' => 0, 'LS' => 0 },
			  'L' => { 's' => 0, 'dFd' => 0 }
			  );

###############################
# MAIN          #########
###################

my @SETSTRUCT = ($SETSTRUCT1, $SETSTRUCT2);
$SETTREE = read_file($SETTREE) if -e $SETTREE;

my $align1_ref = get_alignment($fasta[0]);
my $align2_ref = get_alignment($fasta[1]);

my ($seqname_ref, $align_seq1_ref, $align_seq2_ref) = compare_arrays($align1_ref,$align2_ref);
&checkTree($SETTREE, $seqname_ref) if $SETTREE;

# PETfold for single sequences
my (@align_new, @align_len, @gap_col_ref, @evocon, $evocon, @rel_intra, @single_pet_ref, @paired_pet_ref, @ss_sgl, @partial_bp_open, @partial_bp_close, @evolpartprob, @thermopartprob, $sglthermopartprob, $sglevolpartprob, $totprob, $mypetcon, $ss_sgl, $open_ref, $close_ref);

my $seqnr=1;
for my $align_seq_ref ($align_seq1_ref, $align_seq2_ref) {

  print "PETfold on alignment $seqnr:\n";

  my ($align_new_ref, $gap_col_ref) = delete_gap_columns($align_seq_ref);
  my $seqlg=$#{$align_new_ref->[0]}+1;

  my ($paired_tree_ref,$single_tree_ref,$struct,$totprob) = get_prob_tree($align_new_ref,$seqname_ref,$SETTREE,$SETEVO);

  my ($evocon_bp_left_ref,$evocon_bp_right_ref,$evocon_ss_ref) = get_constr_tree($paired_tree_ref,$single_tree_ref,$struct);
  $evocon = "." x $seqlg;
  $evocon = get_constraint_string($evocon_bp_left_ref,$evocon_bp_right_ref,$evocon_ss_ref,$seqlg) if $SETEVO;

  my $paired_seq_ref = get_prob_paired_seq($align_new_ref,$evocon,$SETTHERMO); 
  my $single_seq_ref = get_prob_unpaired_seq($paired_seq_ref,$SETTHERMO);

  my $single_pet_ref = get_pet_single($single_tree_ref,$single_seq_ref);
  my $paired_pet_ref = get_pet_paired($paired_tree_ref,$paired_seq_ref);
  #print "single_pet_ref:\n";map{print $_."\t"}@{$single_pet_ref};print "\n";
  #print "paired_pet_ref:\n";map{map{print $_."\t"}@$_;print "\n"}@{$paired_pet_ref[$#paired_pet_ref]};print "\n";

  # calculation of the intramolecular PET_reliability
  $SETSTRUCT[$seqnr-1] = read_file($SETSTRUCT[$seqnr-1]) if -e $SETSTRUCT[$seqnr-1];
  $evocon = mod_setstruct($SETSTRUCT[$seqnr-1], $gap_col_ref) if $SETSTRUCT[$seqnr-1];

  my $consstr_ref = test_consstr($evocon,$seqlg);
  my $T_ref = cyk(\%scfg,$paired_pet_ref,$single_pet_ref,$consstr_ref); 
  my $rel_intra = $T_ref->[0][$seqlg-1][0][0]/$seqlg;
	
  # get MFE structure
  if( $T_ref->[0][$seqlg-1][0][0]!=$INFINITE ) {
  	$ss_sgl = backtracking($T_ref);
  }

  $mypetcon = $PETCON;
  while( 1 ) {
	# mRNA is not constrained if $MRNANOTCONSTR is set
	if( $seqnr==2 && $MRNANOTCONSTR ) {
		$open_ref = [];
		$close_ref = [];
		$evocon = "." x $seqlg;
		$sglthermopartprob = 1;
		$sglevolpartprob = 1;	
	}
	else {
        	# find basepairs in the MFE structure that should be constraint as intramolecular basepairs -> partial structure 
        	# selection of indizes of basepairs in the MFE structure with large intramolecular basepaired reliability
		($open_ref, $close_ref) = get_partial_struct($paired_pet_ref, $ss_sgl, $mypetcon);	

		# calculate probability of partial structure compared to all possible structures
		$evocon = get_constraint_string($open_ref,$close_ref,[],$seqlg);
		# thermodynamic part
		$sglthermopartprob = get_fold_thermo_partial_prob($align_new_ref,$evocon);
		# evolutionary part
		$sglevolpartprob = get_fold_evol_partial_prob($align_new_ref,$evocon);
	}
	print "petcon = $mypetcon; thermodynamic partial structure probability = $sglthermopartprob; evolutionary partial structure probability = $sglevolpartprob\n";

	# test threshold of partial partition function
	# if test is negative then repeate procedure with increased max. intramol. base-paired rel. of bases to be free for dublex binding (PETCON)
	if( (!$PARTPROB_AND && ($sglthermopartprob >= $PARTPROB || $sglevolpartprob >= $PARTPROB || $mypetcon >= 1 )) ||
	($PARTPROB_AND && (($sglthermopartprob >= $PARTPROB && $sglevolpartprob >= $PARTPROB) || $mypetcon >= 1) ) )
	{
		push @thermopartprob, $sglthermopartprob;
		push @evolpartprob, $sglevolpartprob;
                push @ss_sgl, include_gap_columns($ss_sgl,$gap_col_ref);
		push @align_new, $align_new_ref;
                push @align_len, $seqlg;
		push @gap_col_ref, $gap_col_ref;  
		push @single_pet_ref, $single_pet_ref; 
		push @paired_pet_ref, $paired_pet_ref;
                push @evocon, $evocon;
		push @rel_intra, $rel_intra;
		push @partial_bp_open, $open_ref;
		push @partial_bp_close, $close_ref;

		my $evocon_gap = include_gap_columns($evocon, $gap_col_ref);
		print "Partial structure $seqnr: $evocon_gap\n";
		$seqnr++;
 		last;
	}
	else {
		$mypetcon += ($mypetcon>=0.9) ? 0.02 : 0.1;
	}
  }

}
# check if any base is free for intermolecular binding
my $nrconstr = $#{$partial_bp_open[0]} + $#{$partial_bp_close[0]} + $#{$partial_bp_open[1]} + $#{$partial_bp_close[1]} + 4;
die("No base is free for intermolecular binding!\n") if $align_len[0] + $align_len[1] - $nrconstr == 0;
#create_ppfile($paired_pet_ref[0],$single_pet_ref[0],"seq1.pp") if $PPFILE;
#create_ppfile($paired_pet_ref[1],$single_pet_ref[1],"seq2.pp") if $PPFILE;

# set BETA to BETADUPLEX
$BETA = $BETADUPLEX;

# PETcofold for RNA binding
my ($cofold_paired_tree_ref, $cofold_single_tree_ref, $struct, $cofold_paired_seq_ref, $cofold_single_seq_ref, $cofold_single_pet_ref, $cofold_paired_pet_ref, $duplg, $consstr_ref, $T_ref, $ss_duplex, $ss_sgl1,$ss_sgl2, $ss_inter_ref, $ss_inter);

# update partial structure
$evocon[0] = get_constraint_string([],[],[@{$partial_bp_open[0]},@{$partial_bp_close[0]}],$align_len[0]);
if( !$MRNANOTCONSTR ) {
	# mRNA is constrained by default
	$evocon[1] = get_constraint_string([],[],[@{$partial_bp_open[1]},@{$partial_bp_close[1]}],$align_len[1]);       
}
#print "RNAcofold constraint: " . include_gap_columns($evocon[0], $gap_col_ref[0]) . "&" . include_gap_columns($evocon[1], $gap_col_ref[1]) . "\n";

# evolutionary scoring adapts dublex reliabilities by evol. probability of partial structure (intramolecular constrained structure)
($cofold_paired_tree_ref, $cofold_single_tree_ref, $struct) = get_cofold_prob_tree(\@align_new,$seqname_ref,\@evocon,$SETTREE,$SETEVO);
$cofold_paired_tree_ref = multiply_partprob_paired($cofold_paired_tree_ref,\@evolpartprob);
$cofold_single_tree_ref = multiply_partprob_single($cofold_single_tree_ref,\@evolpartprob);
#print "cofold_paired_tree_ref:\n";map{map{print $_."\t"}@$_;print "\n"}@$cofold_paired_tree_ref;print "\n";
#print "cofold_single_tree_ref:\n";map{print $_."\t"}@$cofold_single_tree_ref;print "\n"; 

# thermodynamic scoring adapts dublex reliabilities by thermo. probability of partial structure (intramolecular constrained structure) 
$cofold_paired_seq_ref = get_cofold_prob_paired_seq(\@align_new,\@evocon,$SETTHERMO);
$cofold_paired_seq_ref = multiply_partprob_paired($cofold_paired_seq_ref,\@thermopartprob);
$cofold_single_seq_ref = get_prob_unpaired_seq($cofold_paired_seq_ref,$SETTHERMO);
$cofold_single_seq_ref = multiply_partprob_single($cofold_single_seq_ref,\@thermopartprob);
#print "cofold_single_thermo_ref:\n";map{print $_."\t"}@$cofold_single_seq_ref;print "\n";
$cofold_single_pet_ref = get_pet_single($cofold_single_tree_ref,$cofold_single_seq_ref);
$cofold_paired_pet_ref = get_pet_paired($cofold_paired_tree_ref,$cofold_paired_seq_ref);

# merging of intramolecular reliabilities for bases with large intramolecular basepaired reliabilities
# and intermolecular reliabilities for the other bases
$cofold_single_pet_ref = merge_single_fold_cofold_pet($cofold_single_pet_ref,\@single_pet_ref,[@partial_bp_open, @partial_bp_close]);
$cofold_paired_pet_ref = merge_paired_fold_cofold_pet($cofold_paired_pet_ref,\@paired_pet_ref,\@thermopartprob,\@evolpartprob,\@partial_bp_open, \@partial_bp_close);
#print "merged_cofold_paired_pet_ref:\n";map{map{print $_."\t"}@$_;print "\n"}@$cofold_paired_pet_ref;print "\n";
#map{print $_."\t"}@$cofold_single_pet_ref;print "\n";

# create constraint string for dublex folding considering intramolecular basepairs and linker as single-stranded
$evocon = $evocon[0] . "xxx" . $evocon[1];
$duplg = $align_len[0]+$align_len[1]+3;
$consstr_ref = test_consstr($evocon,$duplg);
#print "evocon:\t$evocon\n";
#print "constraint:\n";map{print "$_\t"}@$consstr_ref;print "\n";

# MEA-CYK to find the most reliable RNA duplex structure
$T_ref = cyk_cofold(\@align_new,\%scfg,$cofold_paired_pet_ref,$cofold_single_pet_ref,$consstr_ref);
#print "T:\n";my $i=-1;map { map { map { print "$_->[0]\t" } @{$_}; print "\n" } @{$_}; $i++; print "this was i=$i\n" } @{$T_ref};print "\n";

if( $T_ref->[0][$duplg-1][0][0]!=$INFINITE ) {
	$ss_duplex = backtracking($T_ref);
	$ss_sgl1 = substr($ss_duplex, 0, $align_len[0]);
	$ss_sgl2 = substr($ss_duplex, $align_len[0]+3); 
}

# get non-constrained base-pairs for the final structure prediction
$EXTSTEM = 0;
($open_ref, $close_ref) = get_partial_struct($cofold_paired_pet_ref, ($ss_sgl1 . $ss_sgl2), 0);

# generate duplex structure string
map{ $ss_sgl1=substr($ss_sgl1,0,$_)."{".substr($ss_sgl1,$_+1) } @{$partial_bp_open[0]};
map{ $ss_sgl1=substr($ss_sgl1,0,$_)."}".substr($ss_sgl1,$_+1) } @{$partial_bp_close[0]};
map{ $ss_sgl2=substr($ss_sgl2,0,$_)."{".substr($ss_sgl2,$_+1) } @{$partial_bp_open[1]};
map{ $ss_sgl2=substr($ss_sgl2,0,$_)."}".substr($ss_sgl2,$_+1) } @{$partial_bp_close[1]};
$ss_sgl1 = include_gap_columns($ss_sgl1,$gap_col_ref[0]);
$ss_sgl2 = include_gap_columns($ss_sgl2,$gap_col_ref[1]);
$ss_duplex = $ss_sgl1 . "&" . $ss_sgl2;

# get interaction sites
$ss_inter_ref = get_intermol_struct($open_ref,$close_ref,$align_len[0],$align_len[1]);
$$ss_inter_ref[0] = include_gap_columns($$ss_inter_ref[0],$gap_col_ref[0]);
$$ss_inter_ref[1] = include_gap_columns($$ss_inter_ref[1],$gap_col_ref[1]);
$ss_inter = $$ss_inter_ref[0] . "&" . $$ss_inter_ref[1];

foreach my $b ( ("(",")") ) {
	my $i = index($ss_inter,$b);
	while( $i+1 ) {
		if( $b eq "(" ) {substr($ss_duplex,$i,1,"[")}
		else {substr($ss_duplex,$i,1,"]")}
		$i = index($ss_inter,$b,$i+1);
	}
}

# ###############################
# OUTPUT        #########
###################

create_ppfile($cofold_paired_pet_ref,$cofold_single_pet_ref,$PPFILE) if $PPFILE;

my $score = get_score($ss_sgl1.$ss_sgl2, $cofold_paired_pet_ref,$cofold_single_pet_ref); 

if( $WAR ) {
	for (my $i=0; $i<$#{$align_seq1_ref}+1; $i++) {
    		print ">" . ${$seqname_ref}[$i] . "\n";
		map { print "$_" } @{${$align_seq1_ref}[$i]}; print "&";
		map { print "$_" } @{${$align_seq2_ref}[$i]}; print "\n";
	}
    	print ">structure\n";
	$ss_duplex = $ss_inter if $INTERMOL;
        $ss_duplex =~ s/-/\./g;
        print $ss_duplex."\n";
}
else {
# 	if( $SETEVO ) {
#		print "Pfold RNA sec.struct.:\t\t".include_gap_columns($struct,$gap_col_ref)."\n";
#		print "Constraints:\t\t\t".include_gap_columns($evocon,$gap_col_ref)."\n";
#    	}
	print "Input Files:\t$fasta[0] $fasta[1]\n"; 
	print "Common Identifiers:\t"; map{ print $_.", " } @$seqname_ref; print "\n";
	print "PETfold RNA sec.struct.:\t$ss_sgl[0] $ss_sgl[1]\n";
    	print "PETcofold RNA sec.struct.:\t$ss_duplex\n" if $T_ref->[0][$duplg-1][0][0]!=$INFINITE;
	print "Intermolecular RNA sec.struct.:\t$ss_inter\n" if $INTERMOL;
    	print "Score_{model,structure}(tree,alignment) = $score\n";
    	# Reliability_inter = Score/{sequence length without gaps}
    	my $rel_inter = $score/($duplg-3);
    	print "Reliability_{model,structure}(tree,alignment) = $rel_inter\n";
	# calculation of the intermolecular PET_reliability
	# delta Reliability binding = Reliab_inter - 1/2*(Reliab_intra(seq1)+Reliab_intra(seq2))
	print "delta Reliability binding = " . ($rel_inter - 1/2*($rel_intra[0]+$rel_intra[1])) . "\n";
}

###############################
# FUNCTIONS     #########
###################

sub map_constraints
{
    my ($constr_ve, $length1) = @_;
    my (@constr1, @constr2, @duplex_constr, $i);

    foreach $i ( @$constr_ve ) {
        if( $i < $length1 ) {
                my $k = $i;
                push @constr1, $k;
        }
        else {
                my $k = $i - $length1;
                push @constr2, $k;
        }
    }
    push @duplex_constr, \@constr1;
    push @duplex_constr, \@constr2;

    return \@duplex_constr;
}

# to test the correctness of the input consensus structure and sign the related bases of all basepairs
sub test_consstr
{
    my ($consstr,$align_length) = @_;
    my @ss;

    my $bp = -1;
    my @consstr = split "",$consstr;

    die("Consensus structure invalid: wrong length!\n") if $#consstr!=$align_length-1;

    for(my $i=0; $i<$#consstr+1; $i++) {
	if( $consstr[$i] eq '<' ) {
	    push @ss, ++$bp;
	}
	elsif( $consstr[$i] eq '>' ) {
	    push @ss, $bp--;
	}
	elsif( $consstr[$i] eq 'x' ) {
	    push @ss, -1;
	}
	else{
	    push @ss, $INFINITE;
	}
    }

    die("Consensus structure invalid: number of left and right parenthesis unequal!\n") if $bp!=-1;

    return \@ss;
}

# to extract paired and single probabilities of tree
sub get_prob_tree
{
	my ($align_ma,$org_ve,$SETTREE,$SETEVO) = @_;
	my ($l,$i,$j,$line,@paired,@single,$totprob);
	my $struct = "";

	if($SETEVO) {
	    # write fasta file
	    my ($fafile, $seqname_ve) = get_fasta_file($align_ma, $org_ve);
	    
	    # to run pfold to get the phylogenetical probabilities of the input alignment 
	    my $colfile = "t$ID.col";
	    # to change the format
	    system "$Pfold/fasta2col $fafile | sed 's/arbitrary/RNA/g' > $colfile";
	    # to find the phylogeny of the sequences using the neighbour joining approach
	    # to estimate maximum likelihood of the branch lengths
	    # to calculate pairwise and single probabilities <depending on the tree using Felsensteins dynamic programming for phylogenetic trees
	    my $ppfile = "t$ID.pp";
	    my $pfoldout = "t$ID.out";
	    my $mytree = "t2$ID.col";
	    my $mytree_ml = "t3$ID.col";
	    # phylogenetical tree is given
    	    if($SETTREE) {
		system "echo \"$SETTREE:0\" | $Pfold/newick2col > $mytree";
		# branch lengths are not given and are estimated by maximum likelihood
		if(index($SETTREE,":")==-1) {
			system "cat $colfile | $Pfold/nohead >> $mytree";
			system "$Pfold/mltree $Pfold/scfg.rate $mytree 2> /dev/null > $mytree_ml";
			system "$Pfold/scfg --treeinfile --ppfile $ppfile $Pfold/article.grm $mytree_ml 2> /dev/null > $pfoldout";
		}
		# branch lengths are given
		else {
			#system "$Pfold/drawphyl --mult=5 $mytree > tree2.ps";
			system "$Pfold/scfg --treefile --ppfile $mytree $ppfile $Pfold/article.grm $colfile > $pfoldout";
		}
	    }
	    # phylogenetical tree is not given and is estimated by neighbour joining approach
	    else {
	    	system "$Pfold/findphyl $Pfold/scfg.rate $colfile | $Pfold/mltree $Pfold/scfg.rate 2> /dev/null | $Pfold/scfg --treeinfile --ppfile $ppfile $Pfold/article.grm 2> /dev/null > $pfoldout";
	    }   

	    open IN, $ppfile || die("Can not open the file!\n");
	    $l=<IN>;
	    # write paired probabilities in a 2D-array
	    for($i=0; $i<$l; $i++) {
		$line = <IN>;
		my @row = split " ", $line;
		#map { $_=log($_)/log(2) if $_!=0; $_=$INFINITE if $_==0 } @row;
		push (@paired, \@row);
	    }
	    <IN>;
	    # write single probabilities in a 1D-array
	    $line = <IN>;
	    @single = split " ", $line;
	    <IN>;
	    # write probability of all possible structures
	    $totprob = <IN>;
	    chomp($totprob);

	    close IN;

	    # write Pfold predicted RNA secondary structure in a string
	    $struct = get_pfold_sec_struct($pfoldout);

	    system "rm -f $colfile $ppfile $fafile $pfoldout";
	}
	else {
	    $l=$#{$align_ma->[0]}+1;
	    $totprob = 0;		    

	    for($i=0; $i<$l; $i++) {
		my @row;
		for($j=0; $j<$l; $j++) {
		    push @row, 0;
		}
		push (@paired, \@row);
		push (@single, 0);
	    }
	}

	return \@paired,\@single,$struct,$totprob;	
}

# to extract the Pfold predicted RNA secondary structure from the Pfold output file
sub get_pfold_sec_struct
{
    my ($pfoldout) = @_;
    my (@l);
    my $status = 0;
    my $string = "";

    open IN, $pfoldout || die("Can not open the file!\n");
    while(<IN>) {
	$status = 1 if /RNA/;
	if( /^; \*/ && $status ) {
	    return $string;
        }
	next if /^;/;
        next unless $status;
	
	@l = split " ", $_;
        if( $l[4] eq '.' ) {
	    $string .= ".";
        }
	else{
	    if( $l[3]<$l[4] ) {
		$string .= "(";
	    }
	    else{
		$string .= ")";
	    }
	}
    }
    close IN;

    return $string;
}

# to get the indizes of left and right bases of basepairs and single-stranded bases which are evolutionary highly conserved
sub get_constr_tree
{
    my ($paired_tree_ma,$single_tree_ve,$struct) = @_;
    my (@struct,$i,$l);
    my @stack = (0);

    my @bp_left = ();
    my @bp_right = ();
    my @ss = ();

    if( $struct ne "" ) {
	@struct = split "",$struct;

	for($i=0; $i<$#struct+1; $i++){
	    if( $struct[$i] eq "(" ) {
		unshift @stack, $i;
	    }
	    elsif( $struct[$i] eq ")" ) {
		$l = shift @stack;
		if( $paired_tree_ma->[$l][$i] > $EVOCON_BP ) {
		    unshift @bp_left, $l;
		    unshift @bp_right, $i;
		}
	    }
	    else{
		unshift @ss, $i if $single_tree_ve->[$i] > $EVOCON_SS;
	    }
	}
    }

    return \@bp_left,\@bp_right,\@ss;
}

# to get the indizes of single-stranded bases which are highly reliable in the PET-model
sub get_constr_single
{
    my ($single_pet_ve, $mypetcon) = @_;
    my @ss = ();

    for(my $i=0; $i<$#{$single_pet_ve}+1; $i++) {
 	unshift @ss, $i if $single_pet_ve->[$i] > $mypetcon;
    }

    return \@ss;
}

# to get the indizes of bases that are not highly reliably base-paired in the PET-model
sub get_constr_notpaired
{
    my ($paired_pet_ve, $mypetcon) = @_;
    my @ss = ();
    my $constr;

    for(my $i=0; $i<$#{$paired_pet_ve}+1; $i++) {
	$constr = 1;
	for(my $j=$i+1; $j<$#{$paired_pet_ve}+1; $j++) {
 		if( $paired_pet_ve->[$i][$j] >= $mypetcon ) {
			$constr = 0;
			last;
		}
	}
	for(my $j=0; $j<$i; $j++) {
		if( $paired_pet_ve->[$j][$i] >= $mypetcon ) {
			$constr = 0;
			last;
		}
	}
	unshift @ss, $i if $constr;
    }

    return \@ss;
}

# to get the indizes of bases that are highly reliably base-paired in the PET-model and part of the most reliable structure
# if $EXTSTEM: constraint stems get extended by inner basepairs and outer basepairs
# if the average reliability is larger than the threshold PETCON of the inner or outer extended stem, respectively
sub get_partial_struct
{
    my ($paired_pet_ve, $ss, $mypetcon) = @_;
    my (@open, @close, $constr, @index, $k, @stem_pet, $sum, $avg_stem_pet, @pre_stem_open, @pre_stem_close);

    my @struct = split "", $ss;
    for(my $i=0; $i<$#struct+1; $i++){
	if( $struct[$i] eq "(" ) {
		unshift @index, $i;
	}
	elsif( $struct[$i] eq ")" ) {
		$k = shift @index;
		if( $paired_pet_ve->[$k][$i] >= $mypetcon ) {
			unshift @open, $k;
			unshift @close, $i;
		}
	}
	next if !$EXTSTEM;
	if( $struct[$i] eq ")" ) {
		if( $paired_pet_ve->[$k][$i] >= $mypetcon ) {
			push @stem_pet, $paired_pet_ve->[$k][$i];
		}
		elsif( @stem_pet ) {
			$sum=0;
			foreach (@stem_pet) { $sum += $_ }
			#inner stem extension
			my $stem_l = @stem_pet;
			my ($inner_open_ve, $inner_close_ve) = inner_stem_extension($sum, $stem_l, \@pre_stem_open, \@pre_stem_close, $paired_pet_ve, $mypetcon);
			@pre_stem_open = ();
			@pre_stem_close = ();
			unshift @open, @$inner_open_ve;
			unshift @close, @$inner_close_ve;
			#outer stem extension
			$avg_stem_pet = ($sum+$paired_pet_ve->[$k][$i])/(@stem_pet+1);
			if( $avg_stem_pet >= $mypetcon ) {
				unshift @open, $k;
				unshift @close, $i;
				push @stem_pet, $paired_pet_ve->[$k][$i];
			}
			else {
				@stem_pet = ();	
			}
		}
		else {
			unshift @pre_stem_open, $k;
			unshift @pre_stem_close, $i;
		}
	}
	elsif( $struct[$i] ne "(" ) {
		if( @stem_pet ) {
			$sum=0;
			foreach (@stem_pet) { $sum += $_ }
			#inner stem extension
			my $stem_l = @stem_pet;
			my ($inner_open_ve, $inner_close_ve) = inner_stem_extension($sum, $stem_l, \@pre_stem_open, \@pre_stem_close, $paired_pet_ve, $mypetcon);
			@pre_stem_open = ();
			@pre_stem_close = ();
			unshift @open, @$inner_open_ve;
			unshift @close, @$inner_close_ve;

			@stem_pet = ();
		}		
	}
    }

    return (\@open, \@close);
}

# constraint stems get extended by inner basepairs if the average reliability of the extended stem is higher as the threshold PETCON
sub inner_stem_extension
{
    my ($stem_sum, $stem_l, $pre_stem_open_ve, $pre_stem_close_ve, $paired_pet_ve, $mypetcon) = @_;
    my ($avg_stem_pet, @inner_open, @inner_close);

    my $sum_pre = 0;
    if( @$pre_stem_open_ve ) {
	while( 1 ) {
		my $l = shift @$pre_stem_open_ve;
		my $h = shift @$pre_stem_close_ve;
		$sum_pre += $paired_pet_ve->[$l][$h];
		$stem_l++;
		$avg_stem_pet = ($stem_sum+$sum_pre)/$stem_l;
		#print  "inner: $avg_stem_pet\t$l\t$h\n";
		if( $avg_stem_pet >= $mypetcon ) {
			unshift @inner_open, $l;
			unshift @inner_close, $h;
		}
		else {
			last;
		}
		last if !@$pre_stem_open_ve;
	}
    }

    return (\@inner_open, \@inner_close);
}

# to extract the alignment in a matrix (lines: sequences, rows: baseposition) from fasta-file
sub get_alignment
{
	my ($fasta) = @_;
        my (@align, $seqname);
        my $seq = "";

        open IN, $fasta || die("Can not open the file!\n");
        $_=<IN>; chomp($_); s/>//; $seqname = $_; #push (@seqname, $_);
        while( <IN> ) {
                chomp($_);
                if( /^>/ ) {
                        my @row = split "", $seq;
                        push @align, [$seqname, @row];
                        $seq = "";
                        s/>//;
                        $seqname = $_; #push (@seqname, $_);
                        next;
                }
                $_=~ tr/\.gcautT/-GCAUUU/;
                $seq .= $_;
        }
        close IN;
        my @row = split "", $seq;
        push @align, [$seqname, @row]; #push (@align, \@row);

        return \@align; #return \@align, \@seqname;     
}


# to delete columns in an alignment with more or equal $GAP% of gaps (25% in Pfold)
sub delete_gap_columns
{
	my ($align_ma) = @_;
	my ($i,$j,$nr,@d,@align_ma_new,$start,$k);

	my $seqnr=$#{$align_ma}+1;
        my $seqlg=$#{$align_ma->[0]}+1;

	# to find columns where gap rate is higher and equal as $GAP%
	for( $j=0; $j<$seqlg; $j++ ) {
		$nr = 0;
		for( $i=0; $i<$seqnr; $i++ ) {
			$nr++ if $align_ma->[$i][$j] eq "-";
		}
		push @d, $j if $nr/$seqnr>=$GAP;
	}

	# to delete these columns
	for( $i=0; $i<$seqnr; $i++ ) {
		my $subseq = "";
		$start = 0;
		for( $j=0; $j<$#d+1; $j++ ) {
			for( $k=$start; $k<$d[$j]; $k++ ) {
				$subseq .= $align_ma->[$i][$k];
			}
			$start = $d[$j]+1;
		}
		for( $k=$start; $k<$seqlg; $k++ ) {
                	$subseq .= $align_ma->[$i][$k];
                }
		my @row = split "", $subseq;
		push (@align_ma_new, \@row);
	}

	return \@align_ma_new,\@d;
}

# to add gaps in a string
sub include_gap_columns
{
	my ($ss,$gap_col_ve) = @_;

	my @gaps_sorted = sort {$a<=>$b} @{$gap_col_ve};
	my @ss = split "", $ss;
	my $ss_new = "";
	my $offset = 0;

	for( my $i=0; $i<=$#ss+1; $i++ ) {
		foreach( @gaps_sorted ) {
			if( $_==$i+$offset ) {
				$ss_new .= "-";
				$offset++;
			}
		}
		$ss_new .= $ss[$i] if defined $ss[$i];
	}

	return $ss_new;
}

# to write a fasta file
sub get_fasta_file
{
	my ($align_ma,$org_ve,$evocon,$gapid2origid_ma) = @_;
	my ($i,$n,$evocon_gap_free,@name);

	my $seqnr=$#{$align_ma}+1;

	my $fname = "t$ID.fa";
	open(file_local, ">$fname");
	for( $i=0; $i<$seqnr; $i++ ) {
		$n = ($org_ve) ? $org_ve->[$i] : $ID.$i;
		push @name, $n;
		print file_local ">$n\n";
		map{ print file_local "$_" } @{$align_ma->[$i]};
		print file_local "\n";
		# include constraint string
		if( defined $evocon ) {
		    # adapt constraint string to the gap free sequence
		    $evocon_gap_free = "";
		    map { $evocon_gap_free .= substr($evocon,$_,1) } @{$gapid2origid_ma->[$i]};
		    print file_local $evocon_gap_free . "\n";
		}
	}
	close(file_local);

	return $fname,\@name;
}

# to write a cofold fasta file
sub get_cofold_fasta_file
{
	my ($fname,$align_vema,$org_ve,$connector,$evocon_ve,$gapid2origid_vema) = @_;
	my ($i,$n,$evocon_gap_free,@name);

	my $seqnr=$#{$align_vema->[0]}+1;
	open(file_local, ">$fname");
	for( $i=0; $i<$seqnr; $i++ ) {
		$n = ($org_ve) ? $org_ve->[$i] : $ID.$i;
		push @name, $n;
		print file_local ">$n\n";
		map{ print file_local "$_" } @{$align_vema->[0][$i]};
		print file_local $connector;	
		map{ print file_local "$_" } @{$align_vema->[1][$i]};
		print file_local "\n";
		# include constraint string
		if( defined $evocon_ve ) {
		    # adapt constraint string to the gap free sequence
		    $evocon_gap_free = "";
		    map { $evocon_gap_free .= substr($$evocon_ve[0],$_,1) } @{$gapid2origid_vema->[0][$i]};
		    $evocon_gap_free .= $connector;
		    map { $evocon_gap_free .= substr($$evocon_ve[1],$_,1) } @{$gapid2origid_vema->[1][$i]};
		    print file_local $evocon_gap_free . "\n";
		}
	}
	close(file_local);

	return \@name;
}
 
# to extract the alignment without gaps in a matrix and keep the index relation between original and gap free sequences
# return alignment matrix and 
# matrix reference which columns are sequences, rows are indizes of gap free sequences and entries are related indizes of original sequences
sub get_align_without_gaps
{
	my ($align_ma) = @_;
	my ($i,$j,$k,@align_gap_free,$fname,@d);
	
	my $seqnr=$#{$align_ma}+1;
        my $seqlg=$#{$align_ma->[0]}+1;	

	for( $i=0; $i<$seqnr; $i++ ) {
		my $subseq = "";
		$k = 0;
		for( $j=0; $j<$seqlg; $j++ ) {
			if( $align_ma->[$i][$j] ne "-" ) {
				$d[$i][$k++] = $j;
				$subseq .= $align_ma->[$i][$j];				
			}
		}
		my @row = split "", $subseq;
                push (@align_gap_free, \@row);
	}

	return \@align_gap_free,\@d;
}

# to extract basepair probabilities
# calculate energy distributions without gaps, use for gaps the average of probabilities in column
sub get_prob_paired_seq
{
	my ($align_ma,$evocon,$SETTHERMO) = @_;
	my (@prob,@l,$i,$j,$m,$k,$b1,$b2,$gappos,$sum,$seqgf);

	my $seqnr=$#{$align_ma}+1;
	my $seqlg=$#{$align_ma->[0]}+1;

	# to initialise 1 matrix with zero items
	# upper triangle with pairing probabilities of open bonds (RNAfold -p)
	for( my $i=0; $i<$seqlg; $i++) {
		for( my $j=0; $j<$seqlg; $j++) {
		    $prob[$i][$j] = 0;			
		}
	}

	return \@prob unless $SETTHERMO;

	# to calculate basepair probabilities: Pr[(A^i_u,A^{i+j-1}_u)|s_u]
	# write fasta file without gaps
	my ($align_gap_free_ma, $gapid2origid_ma) = get_align_without_gaps($align_ma);
	my ($fname, $seqname_ve) = get_fasta_file($align_gap_free_ma,0,$evocon,$gapid2origid_ma);

	# call RNAfold -p
	system "$RNAfold/RNAfold -p -C -d2 -noPS $NOLP < $fname > /dev/null";   # rna secondary structure not required here
	my $cmd = "cat ";
	map {$cmd.=$_."_dp.ps "} @$seqname_ve;
	system $cmd . "> t$ID.dotp.ps";

	# fill matrix with probabilities of folding energies predicted by RNAfold
	open(file_local, "t$ID.dotp.ps");
	my $start = 0;
	my $seq = -1;
	while( <file_local> ) {
	    if( /%data starts here/ ) {
		$start = 1;
		$seq++;
		next;
	    }
	    $start = 0 if /showpage/;
	    next unless $start;
	    
	    @l = split " ";
	    # fetch small bug in RNAfold
	    $seqgf=$#{$align_gap_free_ma->[$seq]}+1;
	    next if $l[0]>$seqgf || $l[0]<1 || $l[1]>$seqgf || $l[1]<1 || $l[2]==0;
	    
	    # RNAfold returns the square roots of the base pair probabilities sqrt{P(i,j)}
	    $prob[ $gapid2origid_ma->[$seq][$l[0]-1] ][ $gapid2origid_ma->[$seq][$l[1]-1] ] += $l[2]*$l[2] if $l[3]=~/ubox/;
	}
	close file_local;
	$cmd = "rm -f ";
	map {$cmd.=$_."_dp.ps "} @$seqname_ve;
	system $cmd;
	system "rm -f t$ID.dotp.ps $fname";

	# devide the sum of probabilities in matrix by the amount of sequences (without gaps)
	for( $i=0; $i<$seqlg; $i++ ) {
		for( $j=$i+1; $j<$seqlg; $j++ ) {
			# count number of sequences with at least one gap in the base pair
			$gappos = 0;
			for( $m=0; $m<$seqnr; $m++ ) {
				$gappos++ if( $align_ma->[$m][$i] eq "-" || $align_ma->[$m][$j] eq "-" ); 
			} 
			$prob[$i][$j] /= $seqnr-$gappos;
		}
	}

	return \@prob;
}

# to compute unpaired probabilities as
# Prob_unpaired(i) = 1 - SUM_j{Prob_paired(i,j)}
sub get_prob_unpaired_seq
{
	my ($paired_seq_vema,$SETTHERMO) = @_;
	my ($i,$j,$k,@prob,$sum);

	my $seqlg=$#{$paired_seq_vema->[0]}+1;

	# initialise output matrix ( column: base position )
	for( $j=0; $j<$seqlg; $j++) {
	    $prob[$j] = 0;
	}

	return \@prob unless $SETTHERMO;

	for( $i=0; $i<$seqlg; $i++) {
		$sum = 0;
		for( $j=$i+1; $j<$seqlg; $j++) {
	    		$sum += $paired_seq_vema->[$i][$j];
		}
		for( $j=0; $j<$i; $j++ ) {
	    		$sum += $paired_seq_vema->[$j][$i];
		}
		$prob[$i] = 1 - $sum;
    	}

	return \@prob;
}


# calculate probability of partial structure in the thermodynamic model for single sequence
sub get_fold_thermo_partial_prob
{
        my ($align_ma,$evocon) = @_;

        return get_thermo_partial_prob($align_ma,$evocon,1);
}

# calculate probability of partial structure in the thermodynamic model for RNA duplex
sub get_cofold_thermo_partial_prob
{
        my ($align_vema,$evocon_ve) = @_;
        my ($evocon, $seqnr, $i, @align_cofold);

        $seqnr=$#{$align_vema->[0]}+1;
        for( $i=0; $i<$seqnr; $i++ ) {
                push @align_cofold, [ @{$align_vema->[0][$i]}, "&" , @{$align_vema->[1][$i]} ];
        }

        $evocon = $$evocon_ve[0] . "&" . $$evocon_ve[1];

        return get_thermo_partial_prob(\@align_cofold,$evocon,0);
}

# calculate probability of partial structure in the thermodynamic model
sub  get_thermo_partial_prob
{
	my ($align_ma,$evocon,$type) = @_;
	my ($partprob, $partenergy, $allenergy, $i, $j);

        # specify program
        my $program = ( $type ) ? "RNAfold" : "RNAcofold";

	# number of sequences
	my $seqnr=$#{$align_ma}+1;

	# write fasta file without gaps
	my ($align_gap_free_ma, $gapid2origid_ma) = get_align_without_gaps($align_ma);
	my ($fname, $seqname_ve) = get_fasta_file($align_gap_free_ma,0,$evocon,$gapid2origid_ma);

	# fetch free energy of the ensemble of structures containing the partial structure and of the whole ensemble
	# call RNAfold [-C] -p -d2
	my $pfname = "pf$ID.txt";
	system "$RNAfold/$program -p0 -C -d2 -noPS $NOLP < $fname > pf$ID.txt";   # dot plot and rna secondary structure not required here
	system "$RNAfold/$program -p0 -d2 -noPS $NOLP < $fname >> pf$ID.txt";     # dot plot and rna secondary structure not required here

	open(file_local, "pf$ID.txt");
	$j = -1; $partenergy = 0; $allenergy = 0;
	while( <file_local> ) {
	   if( /^>/ ) {
		$i = 0;
		$j++;
	   }	
	   if( $i == 3 ) {
		$_ =~ /([-]\d+\.\d+)/;
		if( $j < $seqnr ) {	
			$partenergy += $1;
	   	}
	   	else {
			$allenergy += $1;
	   	}
	   }
	   $i++;
	}
	close file_local;

	system "rm -f $fname pf$ID.txt";

	$partprob = exp(($allenergy-$partenergy)/$KT);

	return $partprob;
}

# calculate probability of partial structure in the evolutionary model for single sequence
sub get_fold_evol_partial_prob
{
        my ($align_ma,$evocon) = @_;

        my $partprob = get_evol_partial_prob($align_ma, $evocon, 1);
        $evocon = "." x length($evocon);
        my $totprob = get_evol_partial_prob($align_ma, $evocon, 1);

        return exp($partprob-$totprob);
}

# calculate probability of partial structure in the evolutionary model for RNA duplex
sub get_cofold_evol_partial_prob
{
        my ($align_vema,$evocon_ve) = @_;
        my ($evocon,$totprob,$partprob, @align_cofold,$seqnr,$i);

        $seqnr=$#{$align_vema->[0]}+1;
        for( $i=0; $i<$seqnr; $i++ ) {
                push @align_cofold, [ @{$align_vema->[0][$i]}, "n" , "n" , "n" , @{$align_vema->[1][$i]} ];
        }
        $evocon = "." x length($$evocon_ve[0]) . "sss" . "." x length($$evocon_ve[1]);
        $totprob = get_evol_partial_prob(\@align_cofold, $evocon, 0);
        $evocon = $$evocon_ve[0] . "sss" . $$evocon_ve[1];
        $partprob = get_evol_partial_prob(\@align_cofold, $evocon, 0);

        return exp($partprob-$totprob);
}

# calculate probability of partial structure in the evolutionary model
sub get_evol_partial_prob
{
	my ($align_ma,$evocon,$grammar) = @_;
	my ($partprob);

	# usage if grammar
	my $article = ( $grammar ) ? "article.grm" : "article_p1.grm";

    	# write fasta file
	my $seqnr=$#{$align_ma}+1;
       	my $seqlg = $#{$align_ma->[0]}+1; 
	# constrained bases with large intramolecular base-paired reliability by 's'
	$evocon =~ s/</\(/g;
 	$evocon =~ s/x/\./g;
	$evocon =~ s/>/\)/g;
	my ($fname, $seqname_ve) = get_fasta_file($align_ma,0);	

	# to get constrained bases as COL-file
	my $faheader = "th$ID.fa";
	my $colheader = "th$ID.col";
	system "echo '>structure' > $faheader";
	system "echo '$evocon' >> $faheader";
	system "$Pfold/fasta2col $faheader | sed 's/arbitrary/RNA/g' > $colheader";
	# to run pfold to get the phylogenetical probabilities of the input alignment 
	my $colfile = "t$ID.col";
	# to change the format
	# to find the phylogeny of the sequences using the neighbour joining approach
	# to estimate maximum likelihood of the branch lengths
	system "$Pfold/fasta2col $fname | sed 's/arbitrary/RNA/g' | $Pfold/findphyl $Pfold/scfg.rate | $Pfold/mltree $Pfold/scfg.rate 2> /dev/null > $colfile";
	# to calculate pairwise and single probabilities depending on the tree using Felsensteins dynamic programming for phylogenetic trees
	my $ppfile = "tt$ID.pp";
	my $pfoldout = "t$ID.out";
	system "cat $colheader $colfile | $Pfold/scfg --treeinfile --ppfile $ppfile $Pfold/$article 2> /dev/null > $pfoldout";

	open IN, $ppfile || die("Can not open the file!\n");
	my $l=<IN>;
	# write paired probabilities in a 2D-array
	for(my $i=0; $i<$l+3; $i++) {
		<IN>;
	}
	$partprob = <IN>;
	chomp($partprob);
	close IN;

    	system "rm -f $colfile $pfoldout $ppfile $faheader $colheader $fname";

	return $partprob;
}


# dublex base-paired reliabilities are adapted by the probability of partial structure (intramolecular constraint structure) 
sub multiply_partprob_paired
{
	my ($cofold_rel_ma,$partprob_ma) = @_;

	# multiply duplex probabilities (step 2) with geometric mean of partial structure probability
	my $seqlg=$#{$cofold_rel_ma->[0]}+1;
	for( my $i=0; $i<$seqlg; $i++ ) {
		for( my $j=$i+1; $j<$seqlg; $j++ ) {
			$cofold_rel_ma->[$i][$j] *= sqrt( $$partprob_ma[0]*$$partprob_ma[1] );
		}
	}	

	return $cofold_rel_ma;
}


# dublex single-stranded reliabilities are adapted by probability of partial structure (intramolecular constraint structure) 
sub multiply_partprob_single
{
	my ($cofold_rel_ve,$partprob_ma) = @_;

	# multiply duplex probabilities (step 2) with geometric mean of partial structure probability
	my $seqlg=$#$cofold_rel_ve+1;
	for( my $i=0; $i<$seqlg; $i++ ) {
		$cofold_rel_ve->[$i] *= sqrt( $$partprob_ma[0]*$$partprob_ma[1] );
	}	

	return $cofold_rel_ve;
}

# to extract basepair probabilities of RNA dublex
# calculate energy distributions without gaps, use for gaps the average of probabilities in column
sub get_cofold_prob_paired_seq
{
	my ($align_vema,$evocon_ve,$SETTHERMO) = @_;
	my (@prob, @l, $seqgf, @id, $i, $m, $j, $k, $sum, @gappos);

	my $seqnr=$#{$align_vema->[0]}+1;
	my $seqlg=$#{$align_vema->[0][0]}+$#{$align_vema->[1][0]}+2;

	# to initialise 1 matrix with zero items
	# upper triangle with pairing probabilities of open bonds (RNAfold -p)
	for( my $i=0; $i<$seqlg; $i++) {
		for( my $j=0; $j<$seqlg; $j++) {
		    $prob[$i][$j] = 0;
		}
	}

	return \@prob unless $SETTHERMO;

	# to calculate basepair probabilities: Pr[(A^i_u,A^{i+j-1}_u)|s_u]
	# write fasta file without gaps
	my ($align_gap_free_seq1_ma, $gapid2origid_seq1_ma) = get_align_without_gaps($align_vema->[0]);
	my ($align_gap_free_seq2_ma, $gapid2origid_seq2_ma) = get_align_without_gaps($align_vema->[1]);
	my @align_gap_free_vema = ($align_gap_free_seq1_ma,$align_gap_free_seq2_ma);
	my @gapid2origid_vema = ($gapid2origid_seq1_ma,$gapid2origid_seq2_ma);
	my $fname = "t$ID.fa";
	my ($seqname_ve) = get_cofold_fasta_file($fname,\@align_gap_free_vema,0,"&",$evocon_ve,\@gapid2origid_vema);

	# call RNAcofold -p
	system "$RNAfold/RNAcofold -p -C -d2 -noPS $NOLP < $fname > /dev/null"; 
	my $cmd = "cat ";
	map {$cmd.=$_."_dp.ps "} @$seqname_ve;
	system $cmd . "> t$ID.dotp.ps";

	# fill matrix with probabilities of folding energies predicted by RNAfold
	open(file_local, "t$ID.dotp.ps");
	my $start = 0;
	my $seq = -1;
	while( <file_local> ) {
	    if( /%data starts here/ ) {
		$start = 1;
		$seq++;
		next;
	    }
	    $start = 0 if /showpage/;
	    next if !$start;
	    
	    @l = split " ";
	    # fetch small bug in RNAfold
	    $seqgf=$#{$align_gap_free_vema[0][$seq]}+$#{$align_gap_free_vema[1][$seq]}+2;
	    next if $l[0]>$seqgf || $l[0]<1 || $l[1]>$seqgf || $l[1]<1 || $l[2]==0;
	    
	    # RNAcofold returns the square roots of the base pair probabilities sqrt{P(i,j)}
	    @id = ();
	    foreach my $l ( $l[0], $l[1] ) {
	    	if( $l<=$#{$gapid2origid_vema[0][$seq]}+1 ) {
			push @id, $gapid2origid_vema[0][$seq][$l-1];   
	    	}
	    	else {
			push @id, $gapid2origid_vema[1][$seq][$l-$#{$gapid2origid_vema[0][$seq]}-2] + $#{$align_vema->[0][0]}+1;	
	    	}
	    }
	    $prob[ $id[0] ][ $id[1] ] += $l[2]*$l[2] if $l[3]=~/ubox/;
	}
	close file_local;
	$cmd = "rm -f ";
	map {$cmd.=$_."_dp.ps ".$_."_ss.ps "} @$seqname_ve;
	system $cmd;
	system "rm -f t$ID.dotp.ps $fname";

	# devide the sum of probabilities in matrix by the amount of sequences (without gaps)
	my ($gappos, $posi, $posj, $seqi, $seqj);
	for( $i=0; $i<$seqlg; $i++ ) {
		for( $j=$i+1; $j<$seqlg; $j++ ) {
			# count number of sequences with at least one gap in the base pair
			$gappos = 0;
			for( $m=0; $m<$seqnr; $m++ ) {
				if( $i<$#{$align_vema->[0][0]}+1 ) {
					$posi = $i;
					$seqi = 0;
				}
				else {
					$posi = $i-$#{$align_vema->[0][0]}-1;
					$seqi = 1;
				}
				if( $j<$#{$align_vema->[0][0]}+1 ) {
					$posj = $j;
					$seqj = 0;
				}
				else {
					$posj = $j-$#{$align_vema->[0][0]}-1;
					$seqj = 1;
				}
				$gappos++ if( $align_vema->[$seqi][$m][$posi] eq "-" || $align_vema->[$seqj][$m][$posj] eq "-" );
			} 
			$prob[$i][$j] /= $seqnr-$gappos;
		}
	}

	return \@prob;
}

# evolutionary scoring of RNA duplex
# assuming one unique tree
sub get_cofold_prob_tree
{
	my ($align_vema,$org_ve,$evocon_ve,$SETTREE,$SETEVO) = @_;
	my ($l,$i,$j,$k,$line,@paired,@single,$struct,$cons,@cons);
	if($SETEVO) {
	    # write fasta file
	    my $seqnr = $#{$align_vema->[0]}+1;
            my $seqlg1 = $#{$align_vema->[0][0]}+1; 
	    my $seqlg = $#{$align_vema->[0][0]}+$#{$align_vema->[1][0]}+5 ;
	    # constrained bases with large intramolecular base-paired reliability by 's'
	    $cons = "." x $seqlg;	
	    for( $i=0; $i<2; $i++) {
		$j=-1;
	    	while(1) {
	    		$j = index($$evocon_ve[$i],"x",$j+1);
	    		last if $j==-1;
	    		if( $i ) {
				push @cons, ($j + $seqlg1 + 3);
			}
			else {
				push @cons, $j;
			}
	    	}
		# add linker
		push @cons, ($seqlg1, $seqlg1+1, $seqlg1+2) if !$i;
	    }
 	    map{ $cons = substr($cons,0,$_)."s".substr($cons,$_+1) } @cons;

	    my $fname = "t$ID.fa";
	    my ($seqname_ve) = get_cofold_fasta_file($fname,$align_vema,$org_ve,"nnn");	

	    # to get constrained bases as COL-file
	    my $faheader = "th$ID.fa";
	    my $colheader = "th$ID.col";
	    system "echo '>structure' > $faheader";
	    system "echo '$cons' >> $faheader";
	    system "$Pfold/fasta2col $faheader | sed 's/arbitrary/RNA/g' > $colheader";

	    # to run pfold to get the phylogenetical probabilities of the input alignment 
	    my $colfile = "t$ID.col";
	    my $colfile2 = "tt$ID.col";
            my $mytree = "t2$ID.col";
	    my $mytree_ml = "t3$ID.col";
	    my $ppfile = "t$ID.pp";
	    my $pfoldout = "t$ID.out";
	    # to change the format
	    system "$Pfold/fasta2col $fname | sed 's/arbitrary/RNA/g' > $colfile";

	    # phylogenetical tree is given
	    if($SETTREE) {
		system "echo \"$SETTREE:0\" | $Pfold/newick2col > $mytree";
		# branch lengths are not given and are estimated by maximum likelihood
		if(index($SETTREE,":")==-1) {
	   		system "cat $colheader $colfile | $Pfold/nohead >> $mytree";
			system "$Pfold/mltree $Pfold/scfg.rate $mytree 2> /dev/null > $mytree_ml";
			system "$Pfold/scfg --treeinfile --ppfile $ppfile $Pfold/article.grm $mytree_ml 2> /dev/null > $pfoldout";
		}
		# branch lengths are given
		else {
			#system "$Pfold/drawphyl --mult=5 $mytree > tree2.ps";
			system "cat $colheader $colfile | $Pfold/scfg --treefile --ppfile $mytree $ppfile $Pfold/article.grm 2> /dev/null > $pfoldout";
		}
	    }
	    # phylogenetical tree is not given and is estimated by neighbour joining approach
	    else {	
	    	# to find the phylogeny of the sequences using the neighbour joining approach
	    	# to estimate maximum likelihood of the branch lengths
	        system "cat $colfile | $Pfold/findphyl $Pfold/scfg.rate | $Pfold/mltree $Pfold/scfg.rate 2> /dev/null > $colfile2";
		# to calculate pairwise and single probabilities depending on the tree using Felsensteins dynamic programming for phylogenetic trees
		system "cat $colheader $colfile2 | $Pfold/scfg --treeinfile --ppfile $ppfile $Pfold/article.grm 2> /dev/null > $pfoldout";
	    }

	    my $seq1_l = $#{$align_vema->[0][0]};
	    open IN, $ppfile || die("Can not open the file!\n");
	    $l=<IN>;
	    # write paired probabilities in a 2D-array
	    for($i=0; $i<$l; $i++) {
		$line = <IN>;
		next if( $i>$seq1_l && $i<$seq1_l+4 );
		my @row = split " ", $line;
		splice @row, $seq1_l+1, 3;
		#map { $_=log($_)/log(2) if $_!=0; $_=$INFINITE if $_==0 } @row;
		push (@paired, \@row);
	    }
	    <IN>;
	    # write single probabilities in a 1D-array
	    $line = <IN>;
	    @single = split " ", $line;
	    splice @single, $seq1_l+1, 3;
	    # write probability of all possible structures
	    <IN>;
	    $totprob = <IN>;
	    chomp($totprob);
	    close IN;

	    # write Pfold predicted RNA secondary structure in a string
	    $struct = get_pfold_sec_struct($pfoldout);

	    system "rm -f $colfile $ppfile $pfoldout $faheader $colheader $fname $colfile2 $mytree $mytree_ml";
	}
	else {
	    $l=$#{$align_vema->[0][0]}+$#{$align_vema->[1][0]}+2;
 
	    for($i=0; $i<$l; $i++) {
		my @row;
		for($j=0; $j<$l; $j++) {
		    push @row, 0;
		}
		push (@paired, \@row);
		push (@single, 0);
	    }
	}

	return \@paired,\@single,$struct;	
}

# MEA-CYK to find the most reliable RNA duplex structure
# to change the RNA duplex input and call the function 'cyk'
sub cyk_cofold
{
	my ($align_vema,$scfg_haha,$cofold_paired_pet_ma,$cofold_single_pet_ve,$constr_ve) = @_;
	my (@offset,$i,$k,@single_pet,@paired_pet);
	
	my $seq1_l = $#{$align_vema->[0][0]}+1;
   	my @single_tmp = @$cofold_single_pet_ve;  

	# connect both sequences by the connector 'xxx' whereas each position has Pr_{single}=1 and Pr_{paired}=0 
	@offset = splice @single_tmp, $seq1_l;
	@single_pet = ( @single_tmp, 1, 1, 1, @offset );

	for( $i=0; $i<$#{$cofold_paired_pet_ma}+1; $i++ ) {
		if( $i==$seq1_l ) {
			my @row = ();
			for $k (0 .. $#{$cofold_paired_pet_ma}+4) {
				push @row, 0;
			}
			for $k (0 .. 2) {
				push @paired_pet, \@row;
			}
		}
		my @paired_tmp = @{$cofold_paired_pet_ma->[$i]}; 
		@offset = splice @paired_tmp, $seq1_l;
		my @row = ( @paired_tmp, 0, 0, 0, @offset ); 
		push @paired_pet, \@row; 
	}

	my $T_ref = cyk($scfg_haha,\@paired_pet,\@single_pet,$constr_ve); 	

 	return $T_ref;	
}

# to calculate the optimal secondary structure from alignment, tree and model (SCFG) by CYK-algorithm
# using phylogenetical and energy folding information
sub cyk
{
	my ($scfg_haha,$paired_pet_ma,$single_pet_ve,$consstr_ve) = @_;
	my (@T,$t,$i,$j,$prod,$s,$k,$pr,$bp);

	my $seqlg=$#$single_pet_ve+1;

	# to initialise cube T which holds arrays 
	# column($i=1,..,$seqlg+1-$j): base position;
	# row($j=1,...,$seqlg): length of subsequence;
	# depth($t): non terminals of SCFG: [0]=>'S', [1]=>'F', [2]=>'L'
	# array($a): [0]=>probability,
	#	     [1]=>reference to 1. child rule or infinite for rule L->s,
	#	     [2]=>reference to 2. child rule or infinite
	for( $i=0; $i<$seqlg; $i++ ) {
		for( $j=0; $j<$seqlg; $j++) {
			for( $t=0; $t<3; $t++) {
				$T[$i][$j][$t][0] = $INFINITE;
				$T[$i][$j][$t][1] = $INFINITE;
				$T[$i][$j][$t][2] = $INFINITE;
			}
		}
	}

	# to write first row of table T with probabilities of unpaired bases in the alignment
	for( $i=0; $i<$seqlg; $i++ ) {

	    #next if( $consstr_ve && $consstr_ve->[$i]!=-1 );
	    next if( $consstr_ve->[$i]!=$INFINITE && $consstr_ve->[$i]!=-1 );

	    # T[$i,0][L] := Pr_{rule}[L->s] x Pr_{single}[\vec{A}^i|T] x prod_{u=0}^{n-1} Pr_{unpaired}[A^i_u|s_u]
	    $T[$i][0][2][0] = $ALPHA * add($scfg_haha->{'L'}{'s'},$single_pet_ve->[$i]*2);

	    # T[$i,0][S] := Pr_{rule}[S->L] x T[$i,0][L]
	    $T[$i][0][0][0] = add($scfg_haha->{'S'}{'L'},$T[$i][0][2][0]);
	    $T[$i][0][0][1] = \$T[$i][0][2];
	}

	# to fill table T
	for( $j=1; $j<$seqlg; $j++ ) {
	    for( $i=0; $i<$seqlg-$j; $i++ ) {

		# rules S->LS & F->LS (standard CYK-algorithm)
		for( $k=1; $k<$j+1; $k++ ) {

		    next if( $T[$i][$k-1][2][0]==$INFINITE || $T[$i+$k][$j-$k][0][0]==$INFINITE );

		    # T[$i,$j][S] := max{ T[$i,$j][S], Pr_{rule}[S->LS] x T[$i,$k-1][L] x T[$i+$k,$j-$k][S] }
		    $pr = add(add($scfg_haha->{'S'}{'LS'},$T[$i][$k-1][2][0]),$T[$i+$k][$j-$k][0][0]);
		    if( max($pr,$T[$i][$j][0][0]) ) {
		    	$T[$i][$j][0][0] = $pr;
			$T[$i][$j][0][1] = \$T[$i][$k-1][2];
			$T[$i][$j][0][2] = \$T[$i+$k][$j-$k][0];
		    }		    
		    # T[$i,$j][F] := max{ T[$i,$j][F], Pr_{rule}[F->LS] x T[$i,$k-1][L] x T[$i+$k,$j-$k][S] }
		    $pr = add(add($scfg_haha->{'F'}{'LS'},$T[$i][$k-1][2][0]),$T[$i+$k][$j-$k][0][0]);
                    if( max($pr,$T[$i][$j][1][0]) ) {
		    	$T[$i][$j][1][0] = $pr;
			$T[$i][$j][1][1] = \$T[$i][$k-1][2];
			$T[$i][$j][1][2] = \$T[$i+$k][$j-$k][0];
		    }
		}

		# rule F->dFd (inner bond)
		# loop consists at least 3 unpaired bases
		if( $j>3 ) {

		    if( ($consstr_ve->[$i]==$INFINITE && $consstr_ve->[$i+$j]==$INFINITE) || ($T[$i+1][$j-2][1][0]!=$INFINITE && $consstr_ve->[$i]==$consstr_ve->[$i+$j] && $consstr_ve->[$i]!=-1) ) {
		    
			# T[$i,$j][F] := max{ T[$i,$j][F], Pr_{rule}[F->dFd] x $T[$i+1][$j-2][F] x Pr_{paired}[\vec{A}^i\vec{A}^{$i+$j}|T] x 
			# prod_{u=0}^{n-1} { Pr[(A^$i_u,A^{$i+$j}_u)|(A^{$i-1},$i+$j+1),s_u] if paired, 
			#                    Pr_{unpaired}[A^i_u|s_u] x Pr_{unpaired}[A^{$i+$j}_u|s_u] if unpaired } }
			$pr = add(add($scfg_haha->{'F'}{'dFd'},$T[$i+1][$j-2][1][0]),$paired_pet_ma->[$i][$i+$j]*2);
			if( max($pr,$T[$i][$j][1][0]) ) {
			    $T[$i][$j][1][0] = $pr;
			    $T[$i][$j][1][1] = \$T[$i+1][$j-2][1];
			    $T[$i][$j][1][2] = $INFINITE;
			}
		    }
		}

		# rule L->dFd (open bond)
		# loop consists at least 3 unpaired bases
		if( $j>3 ) {

		    if( ($consstr_ve->[$i]==$INFINITE && $consstr_ve->[$i+$j]==$INFINITE) || ($T[$i+1][$j-2][1][0]!=$INFINITE && $consstr_ve->[$i]==$consstr_ve->[$i+$j] && $consstr_ve->[$i]!=-1) ) {
			
			# T[$i,$j][L] := max{ T[$i,$j][L], Pr_{rule}[L->dFd] x $T[$i+1][$j-2][F] x Pr_{paired}[\vec{A}^i\vec{A}^{$i+$j}|T] x 
			# prod_{u=0}^{n-1} { Pr[(A^$i_u,A^{$i+$j}_u)|s_u] if paired, 
			#                    Pr_{unpaired}[A^i_u|s_u] x Pr_{unpaired}[A^{$i+$j}_u|s_u] if unpaired } }
			$pr = add(add($scfg_haha->{'L'}{'dFd'},$T[$i+1][$j-2][1][0]),$paired_pet_ma->[$i][$i+$j]*2);
			if( max($pr,$T[$i][$j][2][0]) ) {
			    $T[$i][$j][2][0] = $pr;
			    $T[$i][$j][2][1] = \$T[$i+1][$j-2][1];
			    $T[$i][$j][2][2] = $INFINITE;
			}
		    }
		}
		
		# rule S->L
		# T[$i,$j][S] := max{ T[$i,$j][S], Pr_{rule}[S->L] x T[$i,$j][L] }
		if( $T[$i][$j][2][0]!=$INFINITE ) {
		    $pr = add($scfg_haha->{'S'}{'L'},$T[$i][$j][2][0]);
		    if( max($pr,$T[$i][$j][0][0]) ) {
			$T[$i][$j][0][0] = $pr;
			$T[$i][$j][0][1] = \$T[$i][$j][2];
			$T[$i][$j][0][2] = $INFINITE;
		    }
		}
	    }
	}

	return \@T;
}

# to return maximum of values considering infinite values
# true if first value is maximum, else false
sub max
{
    my ($x,$y) = @_;
    my $max;

    if( $x!=$INFINITE && $y!=$INFINITE ) {
        $max = ($x>=$y) ? 1 : 0;
    }
    elsif( $y==$INFINITE ) {
        $max = 1;
    }
    else {
        $max = 0;
    }

    return $max;
}

# to return summand  of values considering infinite values
sub add
{
    my ($x,$y) = @_;

    ( $x!=$INFINITE && $y!=$INFINITE ) ? return $x+$y : return $INFINITE;
}

# backtracking through the CYK-table from T[0,$seqlength-1][S] to T[0..$seqlength-1,0][L]
# to get the consensus secondary structure of the alignment
sub backtracking
{
    my ($T_ref) = @_;
    	
    my $ss = 'S';
    my @rid = (0); 
    my @depth = (0);
    my $depth;

    # to get CYK-table entries of childs of rule S in T[0][n-1] 
    my @c1 = ( $T_ref->[0][$#{$T_ref->[0]}][0][1] );
    my @c2 = ( $T_ref->[0][$#{$T_ref->[0]}][0][2] );   

    while(1) {
	# leave the loop if all non-terminals are replaced by terminals
	last unless $ss=~/[SLF]/;

	# actual Non-terminal is S
	if( $rid[$#rid]==0 ) {
		# rule S->LS
		if( $c2[$#c2] != $INFINITE ) {
			$ss =~ s/S/LS/;

			# S rule
			push @rid, 0;
			push @c1, ${$c2[$#c2]}->[1];
			push @c2, ${$c2[$#c2]}->[2];

			# L rule
			push @rid, 2;
			push @c2, ${$c1[$#c1-1]}->[2];
			push @c1, ${$c1[$#c1-1]}->[1];
			$depth[$#depth]++;
			push @depth, 1;
			next;
		}
		# rule S->L
		else {
			$ss =~ s/S/L/;

			push @rid, 2;
			push @c2, ${$c1[$#c1]}->[2];
			push @c1, ${$c1[$#c1]}->[1];
			$depth[$#depth]++;
			next;		
		}
	}
	# actual Non-terminal is F
	elsif( $rid[$#rid]==1 ) {
		# rule F->LS
		if( $c2[$#c2] != $INFINITE ) {
			$ss =~ s/F/LS/;
	
			# S rule
			push @rid, 0;
			push @c1, ${$c2[$#c2]}->[1];
			push @c2, ${$c2[$#c2]}->[2];

			# L rule
			push @rid, 2;
			push @c2, ${$c1[$#c1-1]}->[2];
			push @c1, ${$c1[$#c1-1]}->[1];
			$depth[$#depth]++;
			push @depth, 1;
			next;
		}
		# rule F->dFd
		else {
			$ss =~ s/F/\(F\)/;

			push @rid, 1;
			push @c2, ${$c1[$#c1]}->[2];
			push @c1, ${$c1[$#c1]}->[1];
			$depth[$#depth]++;
			next;		
		}
    	}
	# actual Non-terminal is L
	else {
		# rule L->dFd
		if( $c1[$#c1] != $INFINITE ) {
			$ss =~ s/L/\(F\)/;

			push @rid, 1;
			push @c2, ${$c1[$#c1]}->[2];
			push @c1, ${$c1[$#c1]}->[1];
			$depth[$#depth]++;
			next;		
		}
		# rule L->s
		else {
			$ss =~ s/L/\./;
			$depth = pop @depth;
			for(my $i=0; $i<$depth; $i++) {
				pop @c1;
				pop @c2;
				pop @rid;
			}
			next;
		}
    	}
    }

    return $ss;
}

# returns a constraint string usable with 'RNAfold -C'
sub get_constraint_string
{
    my ($bp_left_ve,$bp_right_ve,$ss_ve,$length) = @_;
    my @string;
    my $j=0;
    my $string = "";

    for( my $i=0; $i<$length; $i++ ) {
	push @string,".";
    }

    map {$string[$_] = "<"} @$bp_left_ve;
    map {$string[$_] = ">"} @$bp_right_ve;
    map {$string[$_] = "x"} @$ss_ve;

    map {$string .= $_} @string;
    
    return $string;
}

# scan both alignments and take maximal common amount of identifiers (organisms)
# compare only identifier before first occurrence of a dot
# die if less than 3 organisms are common
sub compare_arrays
{
    my ($array1_ma, $array2_ma) = @_;
    my ($org1, $org2, @seqname, @align1, @align2);  

    for( my $i=0; $i<$#{$array1_ma}+1; $i++ ) {
	$org1 = ( index($array1_ma->[$i][0],"\.")>-1 ) ? ( split /\./, $array1_ma->[$i][0] )[0] : $array1_ma->[$i][0];
	for( my $j=0; $j<$#{$array2_ma}+1; $j++ ) {
		$org2 = ( index($array2_ma->[$j][0],"\.")>-1 ) ? ( split /\./, $array2_ma->[$j][0] )[0] : $array2_ma->[$j][0];
		if( $org1 eq $org2 ) { 
			push @seqname, $org1;
			my @arraycopy1 = @{$$array1_ma[$i]}; 
			push @align1, \@arraycopy1;
			my @arraycopy2 = @{$$array2_ma[$j]}; 
			push @align2, \@arraycopy2;
		}
	}
    }

    map { shift @$_ } @align1; 
    map { shift @$_ } @align2;

    &usage2 if $#seqname<2;

    return \@seqname, \@align1, \@align2;
}

# calculates single-stranded probabilities of the PETfold model
sub get_pet_single
{
    my ($single_tree_ve, $single_seq_ma) = @_;
    my (@P, $prod);
    my $seqlg = $#{$single_tree_ve}+1;

    for(my $i=0; $i<$seqlg; $i++) {
    	$P[$i] = add($single_tree_ve->[$i], $BETA * $$single_seq_ma[$i]) / 2;
    }

    return \@P;
}

# calculates base-paired probabilities of the PETfold model
sub get_pet_paired
{
    my ($paired_tree_ma, $paired_seq_vema) = @_;
    my (@P, $prod, $i, $j);
    my $seqlg = $#{$paired_tree_ma}+1;

    for( $i=0; $i<$seqlg; $i++ ) {
	for( $j=0; $j<$seqlg; $j++) {
		$P[$i][$j] = $INFINITE;
	}
    } 	

    for( $j=1; $j<$seqlg; $j++ ) {
    	for( $i=0; $i<$seqlg-$j; $i++ ) {
		$P[$i][$i+$j] = add($paired_tree_ma->[$i][$i+$j], $BETA * $paired_seq_vema->[$i][$i+$j]) / 2;
 	}
    }	

    return \@P;
}


# merging of reliability 0 for bases with large dublex base-paired reliability (fold_ss_ma)
# and single-stranded dublex reliabilities (cofold_single_ve) for bases with low dublex base-paired reliability (NOT fold_ss_ma)
sub merge_single_fold_cofold_pet
{
    my ($cofold_single_ve, $fold_single_ma, $fold_ss_ma) = @_;
    my (@fold_ss, $i, @P);

    foreach $i ( 0 .. $#$cofold_single_ve ) { $P[$i] = $$cofold_single_ve[$i] };

    #set intramolecular basepair constraints as reliability 0
    for( $i=0; $i<$#{$$fold_ss_ma[1]}+1; $i++ ){
	push @fold_ss, $fold_ss_ma->[1][$i] + $#{$$fold_single_ma[0]}+1;
    }
    for( $i=0; $i<$#{$$fold_ss_ma[3]}+1; $i++ ){
	push @fold_ss, $fold_ss_ma->[3][$i] + $#{$$fold_single_ma[0]}+1;
    }
    foreach $i ( @{$$fold_ss_ma[0]},@{$$fold_ss_ma[2]},@fold_ss ){
	    $P[$i] = 0;
    }
 
    return \@P;
}


# merging of probability of partial structures for bases with large intramolecular base-paired reliability (NOT fold_ss_ma)
# and base-paired dublex reliabilities (cofold_paired_ma) for bases with low intramolecular base-paired reliability (fold_ss_ma)
sub merge_paired_fold_cofold_pet
{
    my ($cofold_paired_ma, $fold_paired_vema, $thermopartprob_ma, $evolpartprob_ma, $intramol_bp_open_ma, $intramol_bp_close_ma) = @_;
    my (@P, $i, $j, $offset);

    my $seqlg = $#{$$cofold_paired_ma[0]}+1;

    for( $i=0; $i<$seqlg; $i++ ) {
	for( $j=0; $j<$seqlg; $j++) {
		if( $i<$j ) {
  			$P[$i][$j] = $cofold_paired_ma->[$i][$j];
			$P[$j][$i] = $cofold_paired_ma->[$i][$j];
		}
		elsif( $i==$j ) {
			$P[$i][$j] = 0;
		}
	}	
    } 	

    # dublex reliabilities sequence 1 - multiplied with partial structure probability of sequence 1
    foreach my $k ( 0 .. 1 ) {
    	$j=0;
	$offset=0;
	if( $k ) {
		$offset = $#{$fold_paired_vema->[0][0]}+1;
	}
    	foreach $i ( @{$$intramol_bp_open_ma[$k]} ){
		$P[$i+$offset][$intramol_bp_close_ma->[$k][$j]+$offset] = add($$evolpartprob_ma[$k], $BETA * $$thermopartprob_ma[$k]) / 2;
		$P[$intramol_bp_close_ma->[$k][$j]+$offset][$i+$offset] = add($$evolpartprob_ma[$k], $BETA * $$thermopartprob_ma[$k]) / 2;
		$j++;
	}
    }

    return \@P;
}


# returns a list of the numbers 0 to <lastindex> excluding the numbers in <fold_ss_ve>
sub inverse_list
{
    my ($fold_ss_ve, $lastindex) = @_;
    my @fold_notss;
    my $k=0;

    @$fold_ss_ve = sort( @$fold_ss_ve );
    for my $i (0 .. $lastindex){
	if( defined $$fold_ss_ve[$k] && $i==$$fold_ss_ve[$k] ){
		if( $k<$#{$fold_ss_ve} ) {
			$k++;
		}
	}
	else{
		push @fold_notss, $i;
	}
    }

    return \@fold_notss;
}

 
# returns only the intermolecular base pairs
sub get_intermol_struct
{
    my ($all_open_ve, $all_close_ve, $length1, $length2) = @_;
    my (@interss, $i);

    push @interss, "." x $length1;
    push @interss, "." x $length2;

    for($i=0; $i<$#$all_open_ve+1; $i++) {
        if( $$all_open_ve[$i]<$length1 && $$all_close_ve[$i]>=$length1 ) {
                $interss[0] = substr($interss[0],0,$$all_open_ve[$i])."(".substr($interss[0],$$all_open_ve[$i]+1);
                $interss[1] = substr($interss[1],0,$$all_close_ve[$i]-$length1).")".substr($interss[1],$$all_close_ve[$i]-$length1+1);
        }
    }

    return \@interss;
}


# calculates the score of an RNA structure
sub get_score
{
    my ($struct, $paired_ma, $single_ve) = @_;
    my (@stack, @pseudostack, $j);
    my $score = 0;

    $struct=~s/[&-]//g;
    my @e = split "",$struct;
    for( my $i=0; $i<$#e+1; $i++ ) {
	if( $e[$i] eq "." ) {
		$score += 2 * $ALPHA * $single_ve->[$i];
	}
	elsif( $e[$i] eq "(" ) {
		unshift @stack, $i;	
	}
	elsif( $e[$i] eq ")" ) {
		$j = shift @stack;
		$score += 2 * $paired_ma->[$j][$i];
	}
	elsif( $e[$i] eq "{" ) {
		unshift @pseudostack, $i;	
	}
	elsif( $e[$i] eq "}" ) {
		$j = shift @pseudostack;
	$score += 2 * $paired_ma->[$j][$i];
	}
    }
    return $score;
}


# print the probabilities in a file that can be printed as dotplot using 'drawplot' from Bjarne Knudsen
sub create_ppfile 
{
    my ($paired_ma, $single_ve, $ppfile) = @_;
    my $seqlg = $#{$single_ve}+1;

    open(FILE, ">$ppfile") || die("Can not open the file!\n");
    print FILE $seqlg . "\n";
    for(my $i=0; $i<$seqlg; $i++){
    	for(my $j=0; $j<$seqlg; $j++){
		if( $paired_ma->[$i][$j] == $INFINITE ) {
			if( $paired_ma->[$j][$i] == $INFINITE ) {
				print FILE "0\t";
			}
			else {
				print FILE $paired_ma->[$j][$i] . "\t";
			}
		}
		else {
			print FILE $paired_ma->[$i][$j] . "\t";
		}
	}
	print FILE "\n";
    }
    print FILE "\n";
    map { print FILE "$_\t" } @{$single_ve};    
    close(FILE);

    return 1;
}


# to modify the desired structure as constraints and
# to test the accordance of desired structure to the gap columns (gap must be a unpaired position!!!)
sub mod_setstruct
{
    my ($setstr, $gap_col) = @_;

    $setstr =~ s/\./x/g;
    $setstr =~ s/\(/</g;
    $setstr =~ s/\)/>/g;
    my $tmpstr=$setstr;

    foreach my $k ( reverse @{$gap_col} ){
	die("Consensus structure invalid: wrong length!\n") if $k >= length($tmpstr);
	if( substr($tmpstr,$k,1) ne "x" ){
	    die("The desired RNA structure can not be evaluated! Position $k has to be unpaired because an alignment gap.\n");
	}else{
	    $setstr=substr($setstr,0,$k).substr($setstr,$k+1);
        }
    }

    return $setstr;
}


# read tree and structure file 
sub read_file
{
    my ($file) = @_;
    open IN, $file || die("Can not open file!\n");
    my $l=<IN>;
    chomp $l;
    close IN;
    return $l;
}


# check if the given phylogentetic tree is valid
sub checkTree
{
	my ($tree, $seqname_ve) = @_;
	my ($open, $close, @a, @b, @f, @org, $msg);

	if( length($tree) ) {
		$open=$tree=~s/\(//g;
		$close=$tree=~s/\)//g;
		if( $open != $close ) {
			print "Phylogenetic tree invalid: number of left and right parenthesis unequal!\n";
			exit 1;
		}
		@a=split ":",$tree;
		map{@b=split ",";push @f,@b}@a;
		@a=@f;@f=();
		map{@b=split /\)/;push @f,@b}@a;
		@a=@f;@f=();
		map{@b=split /\(/;push @f,@b}@a;
		map{push @org, $_ if /^[A-Z]/i}@f;
		$msg="Phylogenetic tree invalid: different identifiers (organisms) as in the multiple sequence alignments!\n";
		&checkIdentifier($seqname_ve, \@org, $msg);
	}

	return 1;
}


# check if two lists of identfiers are equal and print a message if not
sub checkIdentifier
{
	my ($nref, $orgref, $msg) = @_;

	my @sortn=sort @$nref;
	my @sortorg=sort @$orgref;
	if( $#sortn != $#sortorg ) {
		print $msg;
		exit 1;
	}
	for(my $i=0;$i<$#sortn+1;$i++) {
		if( $sortn[$i] ne $sortorg[$i] ) {
			print $msg;
			exit 1;
		}
	}

	return 1;
}


# create unique ID for temporary files
# (Attention: RNAfold do not create a dotplot file if _ or - is contained in the fasta header)
sub getUniqueID 
{
    my $pid = "$$"; 			# current PID
    $pid = "a" . $pid; 			# attach a letter to make it a string
    my @l = split(//,$pid);	                # split and get the last 4 numbers (or the last 3 with the 'a')
    $l[$#l-3] = "0" if $l[$#l-3] eq "a";
    my $ID = $l[$#l-3] . $l[$#l-2] . $l[$#l-1] . $l[$#l]; 	# 4digit ID of this process
    my $host = `hostname`;
    $host =~ s/[-\.]//g;
    $ID .= substr($host, 0, 5);		# first 5 characters of hostname (allows numbering of ID from 0 to 999 for RNAfold)

    return $ID;
}


sub numerically { $a <=> $b }

sub usage
{
   die("PETcofold v3.1\n" .
       "==============\n" .
       "by Stefan E Seemann (seemann\@genome.ku.dk)\n" .
       "\n" .	
       "Usage: PETcofold.pl -fasta <file1> -fasta <file2> [ options ] [ parameter settings ]\n" .
       "\n" .
       "Mandatory Input:\n" .
       "   -fasta <file1>  -fasta <file2>    ... 2 alignments with same organisms in fasta format\n" .
       "Options:\n" .
       "   -settree <tree>		     ... calculates score for given tree in Newick tree format\n" .
       "   -setstruct1 <structure>	     ... finds highly reliable base pairs in given structure of first alignment (Dot-bracket notation)\n" .
       "   -setstruct2 <structure>	     ... finds highly reliable base pairs in given structure of second alignment (Dot-bracket notation)\n" .
       "   -war			     	     ... fasta format output\n" .
       "   -intermol			     ... structure output of inter-molecular base pairs\n" .
       "   -ppfile <file>		     ... writes PET probabilities in pp-file\n" .
       "                                         which can be drawn (as ps) using 'programs/pfold/bin/drawdot'\n" .
       "Parameter settings:\n" .    
       "   -setgap <nr>                      ... max. percent of gaps in alignment column (default: 0.25)\n" .
       "   -setpetcon <reliability>          ... max. intra-mol. base pair rel. to be free for RNA-RNA interaction (default: 0.9)\n" .
       "   -setpartprob <probability>	     ... minimal partial structure probability (default: 0.1)\n" .
       "   -noLP			     ... RNA(co)fold option: disallows pairs that can only occur isolated\n" .
       "   -extstem			     ... constraint stems get extended by inner and outer base pairs\n" .
       "PETfold specific parameters:\n" .
       "   -setevocon_bp <reliability>       ... rel. threshold for conserved base pairs (default: 0.9)\n" .
       "   -setevocon_ss <reliability>       ... rel. threshold for conserved single stranded pos. (default: 1)\n" .
       "   -setalpha <nr>                    ... weighting factor for single stranded probs (default: 0.2)\n" .
       "   -setbeta <nr>                     ... weighting factor for thermodynamic overlap (default: 1)\n" .
       "Experimental parameters:\n" .
       "   -setbetaduplex <nr>		     ... weighting factor for thermodynamic overlap in RNA duplex folding - step 2 (default: 1)\n" .
       "					 if the average reliability of the extended stem is larger than 'setpetcon'\n" .
       "   -mrnanotconstr 		     ... mRNA (second sequence by convention) is not constrained in RNA cofolding - step 2\n" .
       "   -partprob_and		     ... thermodynamic AND evolutionary partial structure probability have to be larger than 'setpartprob'\n" .
       "                                         by default OR\n" );
}

sub usage1
{
   die("Have you forgotten to set the right environmental variables for RNAfold as well as Pfold?\n" .
       "\n" .
       "In a bash the commands would be:\n" .
       "\$ export RNAfold_bin='<path_to_RNAfold>'\n" .
       "\$ export Pfold_bin='<path_to_Pfold>'\n\n");
}

sub usage2
{
   die("Both alignments have to have at least three common identifiers\n" .
       "(text before the first occurrence of a dot)\n");
}

