#!/usr/bin/perl -w
use strict;
use Getopt::Std;
use Data::Dumper;
use Cwd;

my %opts = ();
getopts('f:o:c:i:s:x:n:l:', \%opts);

sub usage {
  $0 =~ s/.*\///;
  print STDERR "Usage : $0 <-f fasta file> <-o output name> 

        Optional arguments:
                            -c <on|off> perform clustering (default off)
                            -i <existing foldalign output>
                            -s <score cutoff for clustering> (default 100)
                            -n <on|off> pruning (default on)
                            -l <on|off> Also output column format <default off>
                            -x <java heap size in MB>] (default 500)\n\n";
  exit -1;
}

#This makes the program stop if one of the options is missing
usage()if (! $opts{'f'} || ! $opts{'o'});
open (FILE, $opts{'f'}) || die "Could not open file : $opts{'t'} : $!";

if(! (defined $opts{'c'})){
    $opts{'c'} = "off";
}
if(! (defined $opts{'l'})){
    $opts{'l'} = "off";
}
if(! (defined $opts{'n'})){
    $opts{'n'} = "on";
}
my $heapsize = "500m";
if(defined $opts{'x'}){
    $heapsize = $opts{'x'} ."m";
}

my $i = 0;my $j = 0; my $k;
my $m = 0;my $n = 0;
my (%allIds,@allIds,$key1,$key2,@files);
my ($id, $string);
#I just use this to get a list of all the ids, i.e. keys %allIds
#The value 1 is not important

if(! (-d ".fold_foldalign")){
    system "mkdir .fold_foldalign";
}

###################PATH#########################

my $path = getcwd;
my $fileName;
if(defined $opts{'i'}){
    $fileName = $opts{'i'};
}else{
    $fileName = ".fold_foldalign/".$opts{'o'} .".gz";
}
my $fastaFile = $opts{'f'};
################################################

###################Run FOLDALIGN on the fasta file####################################
if(! (defined $opts{'i'})){
    system "foldalign -global $fastaFile | gzip > " . $fileName;
}
###################Read the sequences from the .tab file###############################
my (%fasta);
while(<FILE>){
    $string = $_;
    if($string =~ /^>(.*?)\s+.*$/){
 	$id = $1;
    }else{
	chomp $string;
	$fasta{$id} .= $string;
    }
}

#my $alleFiler = `ls -1 $path/FoldalignM/simple`;
#my @alleFiler = split(/\n/,$alleFiler);

######################Read the foldalign output and gather relevant information##################
my $c = 0; my $c2 = 0;
my @foldA; my @foldB;
my $status ="";
my $hit = -1;
my @info;
my @ct;my @ct2; #Alignment length for each hit
#foreach my $f (@alleFiler){
#    $status = "";
#    $f = $path . "FoldalignM/simple/" . $f;
$fileName =~ s/(.*\.gz)\s*$/gunzip \-c \< $1|/;
open (FIL, $fileName) || die "Could not open file : $fileName : $!";
while(<FIL>){
    $string = $_;
    if($string =~ /^; NOTE\s+No global alignment was found.*$/){
	#$hit--;
	$status = "";
	#print STDERR "No global alignment in file $f\n";$info[$hit][0]=0;
	next;
    }
    if($string =~ /^; FOLDALIGN\s+.*$/ && ($status eq "" || $status eq "foldB")){$status = "score"; $hit++; $c = 0; $c2=0;}
    if($string =~ /^; ALIGNING\s+(.*)\s+against\s+(.*)$/ && $status eq "score"){$info[$hit][1] = $1;$info[$hit][5] = $2;}
    #if($string =~ /^; ALIGNMENT_ID\s+(.*?)\s+(.*)$/ && $status eq "score"){$info[$hit][1] = $1;$info[$hit][5] = $2;}
    if($string =~ /^; =*?$/ && $status eq "score"){$status = "hitA";}
    if($string =~ /^; -*?$/ && $status eq "hitA"){$status = "fold";}
    if($string =~ /^; \**?$/ && $status eq "fold"){$status = "hitB"; $ct[$hit] = $c;}
    if($string =~ /^; -*?$/ && $status eq "hitB"){$status = "foldB";}
    if($string =~ /^; \**?$/ && $status eq "foldB"){$ct2[$hit] = $c2;}
    if($string =~ /^[NG]\s+([ACGUT-])\s+(.*?)\s+(.*?)\s+(.*?)\s+(.*)$/ && $status eq "fold"){
	#$foldA[$hit][$c][0] = $1;
	$foldA[$hit][$c][1] = $2;
	#$foldA[$hit][$c][2] = $3;
	#$foldA[$hit][$c][3] = $4;
	$foldA[$hit][$c][4] = $5;
	$c++;
    }
    
    if($string =~ /^[NG]\s+([ACGUT-])\s+(.*?)\s+(.*?)\s+(.*?)\s+(.*)$/ && $status eq "foldB"){
	#$foldB[$hit][$c2][0] = $1;
	$foldB[$hit][$c2][1] = $2;
	#$foldB[$hit][$c2][2] = $3;
	#$foldB[$hit][$c2][3] = $4;
	$foldB[$hit][$c2][4] = $5;
	$c2++;
    }
    
    if($string =~ /^; ALIGN\s+Score:\s(\-\d+|\d+)$/ && $status eq "score"){$info[$hit][0] = $1;}
    #if($string =~ /^; ALIGNMENT_ID\s+(.*?)\s+.*$/ && $status eq "hitA"){$info[$hit][1] = $1;}
    #if($string =~ /^; ENTRY\s+(.*)$/ && $status eq "hitA"){$info[$hit][1] = $1;}
    if($string =~ /^; START_POSITION\s+(.*)$/ && $status eq "hitA"){$info[$hit][2] = $1;}
    if($string =~ /^; END_POSITION\s+(.*)$/ && $status eq "hitA"){$info[$hit][3] = $1;}
    if($string =~ /^; ALIGNMENT_LENGTH\s+(.*)$/ && $status eq "hitA"){$info[$hit][4] = $1;}
    #if($string =~ /^; SEQUENCE_LENGTH\s+(.*)$/ && $status eq "hitA"){$info[$hit][4] = $1;}
    #if($string =~ /^; ALIGNMENT_ID\s+.*?\s+(.*)$/ && $status eq "hitA"){$info[$hit][5] = $1;}
    #if($string =~ /^; ENTRY\s+(.*)$/ && $status eq "hitB"){$info[$hit][5] = $1;}
    if($string =~ /^; START_POSITION\s+(.*)$/ && $status eq "hitB"){$info[$hit][6] = $1;}
    if($string =~ /^; END_POSITION\s+(.*)$/ && $status eq "hitB"){$info[$hit][7] = $1;}
    #if($string =~ /^; SEQUENCE_LENGTH\s+(.*)$/ && $status eq "hitB"){$info[$hit][8] = $1;}
}
close(FIL);
#}

my %bp; my %info;my %numberOfGood;
my ($idA, $idB, $idAB, $idBA);
my $score;
my ($X, $Y);

for ($i=0; $i<@info; $i++)
{
    $idA = $info[$i][1];
    $idB = $info[$i][5];
    $idAB = $idA . ":" . $idB;
    $idBA = $idB . ":" . $idA;
    
    if((!(defined $info[$i][0])) || $info[$i][0] == 0){
	$info{$idAB}{'score'} = 0;
	$info{$idBA}{'score'} = 0;
	$info{$idAB}{'length'} = 0;
	$info{$idBA}{'length'} = 0;
	next;
    }
    
    $info{$idAB}{'startA'} = $info[$i][2];
    $info{$idAB}{'stopA'} = $info[$i][3];
    $info{$idAB}{'startB'} = $info[$i][6];
    $info{$idAB}{'stopB'} = $info[$i][7];
    $info{$idAB}{'length'} = $info[$i][4];
    $info{$idAB}{'score'} = $info[$i][0];
    
    $info{$idBA}{'startB'} = $info[$i][2];
    $info{$idBA}{'stopB'} = $info[$i][3];
    $info{$idBA}{'startA'} = $info[$i][6];
    $info{$idBA}{'stopA'} = $info[$i][7];
    $info{$idBA}{'length'} = $info[$i][4];
    $info{$idBA}{'score'} = $info[$i][0];
    
    #Here $X basepairs with $Y
    for ($j=0; $j<$ct[$i]; $j++)
    {
	$X =  $foldA[$i][$j][1];
	$Y =  $foldA[$i][$j][4];
	$bp{$idA}{$idB}{$X} = $Y;
	$X =  $foldB[$i][$j][1];
	$Y =  $foldB[$i][$j][4];
	$bp{$idB}{$idA}{$X} = $Y;
    }
}

#####################################CLUSTERING#################################################

my @clusters;
my $antalCluster;
my %removed;
my @names = keys %bp;
my $score_cutoff = 100;
if($opts{'c'} eq "on"){
    if(! (defined $opts{'s'})){
	$score_cutoff = 100;
    }else{
	$score_cutoff = $opts{'s'};
    }

    if (! (-d ".fold_out"))
    {
	system "mkdir .fold_out";
    }

    open(CLUSTER, ">.fold_out/".$opts{'o'}.".cluster.info") || die "can't open cluster.info";

    my (@scores,@allScores);
    my (@lengths,@allLengths);
    my (%scores,%lengths);
    my @first;
    my @second;

    for(my $i=0; $i<@names;$i++){
	#print "$names[$i]\n";
	for(my $j=$i+1; $j<@names;$j++){
	    my $key = $names[$i] . ":" . $names[$j];
	    $scores[$i+1][$j+1] = $info{$key}{'score'};
	    $lengths[$i+1][$j+1] = $info{$key}{'length'};
	    $allScores[$i+1][$j+1] = $info{$key}{'score'};
	    $allScores[$j+1][$i+1] = $info{$key}{'score'};
	    $allLengths[$i+1][$j+1] = $info{$key}{'length'};
	    $allLengths[$j+1][$i+1] = $info{$key}{'length'};
	    #if($i<10 && $j<10){
	    #    print "$key " . ($i+1) ." ". ($j+1)." $info{$key}{'score'} $info{$key}{'length'}.\n";
	    #}
	}
    }

    for(my $i=1;$i<@scores;$i++){
	for(my $j=$i+1;$j<@{$scores[$i]};$j++){
	    push @{$scores{$scores[$i][$j]}},"$i-$j";
	    $lengths{"$i-$j"} = $lengths[$i][$j];
	    #print "$i $j $scores[$i][$j]\n";
	}
    }

    my $clusters = 0;

    my ($check,$checki,$checkj,$clusteri,$clusterj,%betweenClusters);
    foreach my $k (sort num keys %scores){
	if($k < $score_cutoff){next;}
	@first = @{$scores{$k}};
      PAIR:foreach my $v (@first){
	    if($k < $score_cutoff+$lengths{$v}){next;}
	    $check = 0; $checki = 0; $checkj = 0;
	    ($i,$j) = split("-",$v);
	    if($clusters == 0){
		push @{$clusters[$clusters]}, $i;
		push @{$clusters[$clusters]}, $j;
		$clusters++;
	    }else{
		for(my $c=0; $c<@clusters;$c++){
		    for(my $cc=0;$cc<@{$clusters[$c]};$cc++){
			if($i == $clusters[$c][$cc]){
			    $checki = 1;
			    $clusteri = $c;
			    $check = 1;
			    #print "HMM: $i - $j - $c - $cc - $clusters[$c][$cc]\n";
			    #next PAIR;
			}
			if($j == $clusters[$c][$cc]){
			    $checkj = 1;
			    $clusterj = $c;
			    $check = 1;
			    #print "HMM: $i - $j - $c - $cc - $clusters[$c][$cc]\n";
			    #next PAIR;
			}
		    }
		}
		if($checki && !$checkj){
		    push @{$clusters[$clusteri]}, $j;
		    #next PAIR;
		}
		if(!$checki && $checkj){
		    push @{$clusters[$clusterj]}, $i;
		    #next PAIR;
		}
		if($checki && $checkj && ($clusteri != $clusterj)){
		    #print "$clusteri  $clusterj  $k\n";
		    $betweenClusters{$clusteri}{$clusterj} = 1;
		    $betweenClusters{$clusterj}{$clusteri} = 1;
		}
		if($check == 0){
		    push @{$clusters[$clusters]}, $i;
		    push @{$clusters[$clusters]}, $j;
		    #print "New Cluster: $i - $j - $clusters --- $k\n";
		    $clusters++;
		}
	    }
	}
    }

    $antalCluster = scalar @clusters;
    print CLUSTER "Initial number of clusters: $antalCluster\n";
    my ($perClusterA,$perClusterB,$scoresbetweenClusters,$antalbetweenClusters,$lengthsbetweenClusters);
    %removed = ();
    for(my $i=0;$i<@clusters;$i++){
	if($removed{$i}){next;}
	if($i != $antalCluster-1){
	    for(my $k=$i+1;$k<@clusters;$k++){
		if($removed{$k}){next;}
		if(defined $betweenClusters{$i}{$k}){
		    $perClusterA = scalar @{$clusters[$i]};
		    $perClusterB = scalar @{$clusters[$k]};
		    $antalbetweenClusters=0;
		    $scoresbetweenClusters=0;
		    $lengthsbetweenClusters=0;
		    for(my $l=0;$l<$perClusterA; $l++){
			for(my $j=0;$j<$perClusterB;$j++){
			    #print "Between clusters: $i - $k : $clusters[$i][$l] - $clusters[$k][$j] ::  $allScores[$clusters[$i][$l]][$clusters[$k][$j]]\n";
			    $scoresbetweenClusters += $allScores[$clusters[$i][$l]][$clusters[$k][$j]];
			    $lengthsbetweenClusters += $allLengths[$clusters[$i][$l]][$clusters[$k][$j]];
			    $antalbetweenClusters++;
			}
		    }
		    #print "Cluster $i $k $betweenClusters - $perClusterA - $perClusterB\n";
		    #$betweenClusters /= ($perClusterA * $perClusterB);
		    #$scoresbetweenClusters /= $antalbetweenClusters;
		    if($scoresbetweenClusters > ($antalbetweenClusters*($score_cutoff/2)+$lengthsbetweenClusters)){
		    #if($scoresbetweenClusters > 0){
			print CLUSTER "Merging $i and $k : $scoresbetweenClusters ($antalbetweenClusters) ($lengthsbetweenClusters)\n";
			@{$clusters[$i]} = (@{$clusters[$i]},@{$clusters[$k]});
			$removed{$k} = 1;
			for(my $j=$k+1;$j<@clusters;$j++){
			    if(defined $betweenClusters{$k}{$j}){
				#print "Adding $k - $j : $betweenClusters{$k}{$j} to $i - $j : $betweenClusters{$i}{$j}\n";
				$betweenClusters{$i}{$j} += $betweenClusters{$k}{$j};
			    }
			}
		    }else{
			print CLUSTER "Not merging $i and $k : $scoresbetweenClusters ($antalbetweenClusters) ($lengthsbetweenClusters)\n";
		    }
		}
	    }
	}
    }

    $antalCluster = 0;
    for(my $i=0;$i<@clusters;$i++){
	if($removed{$i}){next;}
	$antalCluster++;
	print CLUSTER "Cluster number $antalCluster : " . scalar(@{$clusters[$i]}) . " members\n";
	for(my $j=0;$j<@{$clusters[$i]};$j++){
	    #if($removed{$j}){next;}
	    #push @{$scores{$scores[$i][$j]}},"$i-$j";
	    #print CLUSTER "$i $j $clusters[$i][$j] = $names[($clusters[$i][$j])-1]\n";
	    print CLUSTER "$names[($clusters[$i][$j])-1]\n";
	}
    }
    print CLUSTER "Number of clusters after merging: $antalCluster\n";
    sub num {$b <=> $a;}
    #exit;
}
############################################################################################


##################Map the the base pairing for each sequence and each position###############
if($opts{'c'} eq "on"){$antalCluster = scalar (@clusters);}else{$antalCluster = 1;}
my $prettyCount = 1;
for(my $cluster=0;$cluster<$antalCluster;$cluster++){
    if($opts{'c'} eq "on"){
	if($removed{$cluster}){next;}
    }

    ##############For each sequence count the number of base pairing at each position########
    my $seq;my @seq;
    my $bp;
    my %count;			#Count of basepairs for each position
    my %count2;
    my %basepair;		# == "("
    my %basepair2;		# == ")"
    my $counter;

    if($opts{'c'} eq "on"){
	foreach my $w (@{$clusters[$cluster]}) {
	    #if($w eq $sorted[0] || $w eq $sorted[1] || $w eq $sorted[2]){next;}
	    $idA = $names[$w-1];
	    $seq = $fasta{$idA};
	    @seq = split('', $seq);
	    #foreach my $q (sort keys %{ %bp->{$w} }) {
	    foreach my $q (@{$clusters[$cluster]}) {
		#if($q eq $sorted[0] || $q eq $sorted[1] || $q eq $sorted[2]){next;}
		$idB = $names[$q-1];
		for($n=0; $n<@seq; $n++){
		    if($bp{$idA}{$idB}{$n}){
			$bp = $bp{$idA}{$idB}{$n};
			if($bp eq "."){
			    next;
			}else{
			    if($bp > $n){
				$count{$idA}[$n]++;
				$basepair{$idA}[$n][$bp]++;
			    }else{
				$count{$idA}[$n]++;
				$basepair2{$idA}[$bp][$n]++;
			    }
			}
		    }
		}
	    }
	}
    }else{
	foreach my $w (sort keys %bp) {
	    $seq = $fasta{$w};
	    @seq = split('', $seq);
	    foreach my $q (sort keys %{$bp{$w}}) {
		#if($q eq $sorted[0] || $q eq $sorted[1] || $q eq $sorted[2]){next;}
		for ($n=0; $n<@seq; $n++)
		{
		    if ($bp{$w}{$q}{$n})
		    {
			$bp = $bp{$w}{$q}{$n};
			if ($bp eq ".")
			{
			    next;
			} else {
			    if ($bp > $n)
			    {
				$count{$w}[$n]++;
				$basepair{$w}[$n][$bp]++;
			    } else {
				$count{$w}[$n]++;
				$basepair2{$w}[$bp][$n]++;
			    }
			}
		    }
		}
	    }
	}
    }

    ###########################Print the base pair probabilities#############################
    my @ids = sort (keys %count);
    my $ids = scalar @ids;
    my $count;
    my $index;
    my $subLength;

    if (! (-d ".fold_matrix"))
    {
	system "mkdir .fold_matrix";
    }
    my $outfiles = ".fold_matrix/" . $opts{'o'};
    if (! (-d $outfiles))
    {
	system "mkdir $outfiles";
    }

    if($opts{'c'} eq "on"){
	$outfiles .= "/" . $prettyCount;
	if (! (-d $outfiles))
	{
	    system "mkdir $outfiles";
	}
    }

    open(SM, ">".$outfiles."/SM.out") || die "can't open SM.out";
    my $antalSekvenser = scalar(@ids);
    print SM $antalSekvenser . "\n";
    foreach my $k (@ids)
    {
	print SM "$k ";
    }
    print SM "\n";
    for (my $i=0; $i<@ids;$i++)
    {
	for (my $j=$i+1; $j<@ids;$j++)
	{
	    my $key = $ids[$i] . ":" . $ids[$j];

	    my $sm_score;
	    if(defined $info{$key}{'score'}){
		$sm_score = $info{$key}{'score'};
	    }else{
		$sm_score = 1;
	    }

	    if ($sm_score < 0)
	    {
		$sm_score = 1;
	    }
	    print SM "" . ($i+1) ." ". ($j+1)." ".$sm_score . "\n";
	}
    }
    close (SM);

    $count = 1;

    foreach my $v (@ids)
    {
	my $seq = $fasta{$v};
	my $seqLength = length($seq);
	open(DOT, ">".$outfiles."/".$count.".out") || die "can't open $count.out";
	$count++;
	print DOT "Sequence:";
	print DOT $fasta{$v} . "\n";
	for (my $i=0; $i <= $seqLength-1; $i++)
	{
	    for (my $j=$i+1; $j <= $seqLength; $j++)
	    {
		if ($basepair{$v}[$i][$j])
		{
		    my $baseScore = ($basepair{$v}[$i][$j]/($ids-1));
		    printf DOT "%d %d %1.7f ubox\n", $i, $j, $baseScore;
		}
	    }
	}
	close (DOT);
    }
    my $execute = "java -server -Xmx".$heapsize." src.FoldalignM"; 
    if($opts{'n'} eq "off"){$execute .= " -no_pruning";}
    if($opts{'l'} eq "on"){$execute .= " -col";}
    $execute .= " $outfiles/SM.out " .$opts{'o'};
    if($opts{'c'} eq "on"){
	if($antalSekvenser > 2){
	    system $execute;
	    system "mv .fold_out/".$opts{'o'}.".original.out .fold_out/".$opts{'o'}.".".$prettyCount.".original.out";
	    system "mv .fold_out/".$opts{'o'}.".refined.out .fold_out/".$opts{'o'}.".".$prettyCount.".refined.out";
	    system "mv .fold_out/".$opts{'o'}.".prob .fold_out/".$opts{'o'}.".".$prettyCount.".prob";
	    if($opts{'l'} eq "on"){
		system "mv .fold_out/".$opts{'o'}.".txt .fold_out/".$opts{'o'}.".".$prettyCount.".txt";
		system "mv .fold_out/".$opts{'o'}.".col .fold_out/".$opts{'o'}.".".$prettyCount.".col";
	    }
	}
	$prettyCount++;
    }else{
	#print $execute;
	system $execute;
    }
}
