#!/usr/bin/perl -sw

###############################################################################
#                                                                             #
# Written by Jakob Hull Havgaard, 2004 - 2007, hull@genome.ku.dk              #
#                                                                             #
# Purpose: Remove overlapping hits from the LS (local score) lines in the     #
#         foldalign output, and estimate the significance of the hit. First   #
#         the best hit is selected. Then all hits overlapping this hit is     #
#         removed. Then the new best hit is selceted and overlapping hits are #
#         removed. From the list of hits the significance is estimated.       #
#         The program is a first hack attempt to do this, and would probaly do#
#         much better with a little work                                      #
#                                                                             #
#                                                                             #
###############################################################################

###############################################################################
#                                                                             #
#   Copyright 2004 - 2007 Jakob Hull Havgaard, hull@genome.ku.dk              #
#                                                                             #
#   This program is free software; you can redistribute it and/or modify      #
#   it under the terms of the GNU General Public License as published by      #
#   the Free Software Foundation; either version 2 of the License, or         #
#   (at your option) any later version.                                       #
#                                                                             #
#   This program is distributed in the hope that it will be useful,           #
#   but WITHOUT ANY WARRANTY; without even the implied warranty of            #
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the             #
#   GNU General Public License for more details.                              #
#                                                                             #
#   You should have received a copy of the GNU General Public License         #
#   along with this program; if not, write to the Free Software               #
#   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA #
#                                                                             #
###############################################################################

#####################
# Handle the option #
#####################

my $big_neg = -20000;
my @level = qw(0);

my $cut=0;
if (defined $p) {$cut=$p;}
$p="";

if (defined $noboot) {$cut = -1;}
$noboot = "";

if ((defined $h) or (defined $help)) {&help($level[0]);}
$h=$help="";

my $clamset = 0;
my $clam = 1000;
my $ck = -1000;
if (defined $l) {
	$clam = $l;
	unless (defined $k) {die "When lambda is set (option -l) kappa must also be set (option -k)\n";}
	$ck = $k;
	$clamset = 1;
}
elsif (defined $k) {die "When kappa is set (option -k) lambda must also be set (option -l)\n";}


my $min_rand = 100;
if (defined $m) {$min_rand = $m;}
$m="";

use strict;

while (<>) {
	my ($name1, $name2, $maxLen, $id, $length1, $length2);
	if (/^Unknown nucleotide/) {next;}
	if (/^; FOLDALIGN/) {&readhead(\$maxLen, \$name1, \$name2, \$id, \$length1, \$length2);}
	else {
		chomp;
		warn "Skipping unknown line $_\n";
		next;
	}

	my @pos_i; # Holds the ends of the hit starting at position i,k

	my %islandAve;
	my %islandScore2;
	my %islandCount;
	my @hits;
	my $hitsCount=0;
	# This array will hold a list of scores and i, k coordinates sorted after score
	my @scoreCords;
	# This array holds a reference to the score in scoreCords
	my @cords;

	# Init to zero
	foreach my $lev (@level) {$islandAve{$lev}=0; $islandCount{$lev}=0;}

	#############################################################################
	# Read the file.                                                            #
	# The array scoreCords hold the score and start coordinates and will be     #
	# sorted later                                                              #
	# The array holds a reference to the scoreCords position making it possible #
	# to easily find the position of an alignment in scoreCords.                #
	# The array pos_i holds the end coordinates                                 #
	#############################################################################
	while (<>) {
		if (/^; \*\*\*\*\*/) {&skipalignment(); last;}
		my ($junk, $i, $j, $k, $l, $s) = split(/\s+/);

		# Throw away alignments with scores below the smallest cutoff.
		if ($s < $level[0]) {next;}

		push(@scoreCords, [$s, $i, $k]);
		$cords[$i][$k] = \$scoreCords[-1];
		$pos_i[$i][$k] = [$j, $l];

	}

	#############################
	# Sort the scoreCords array #
	#############################

	@scoreCords = sort scoreNum @scoreCords;

	###########################################################################
	# Making the list of non-overlapping alignments                           #
	#                                                                         #
	# From scoreCords get the coordinates of best scoring alignment           #
	#                                                                         #
	# Check all potentially overlapping coordinates.                          #
	# If an alignment is found to overlap it is removed from scoreCords using #
	# the reference in the coords array                                       #
	#                                                                         #
	###########################################################################

	my ($max, $mi, $mj, $mk, $ml);
	my $hitNum = 0;
	my @bestscore;
	my $bestcount = 1;

	# Get the best it
	$max = &getNextScore(\@scoreCords, \$mi, \$mk, \$hitNum);
	while ($max >= $level[0]) { # Keep going until the next score is below the cutoff

		# Store the score
		&storeAve($max, \%islandAve, \%islandScore2, \%islandCount, \@level);

		($mj, $ml) = @{$pos_i[$mi][$mk]};

		# Store the hit for later processing
		$hits[$hitsCount] = ["$name1", "$name2", "$mi", "$mj", "$mk", "$ml", "$max", "$maxLen", "$bestcount"];
		$hitsCount++;
		###########################################
		# Delete the hit and all overlapping hits #
		###########################################
		# Make sure the i and k values are positive
		my $start_i = $mi -$maxLen;
		if ($start_i < 1) {$start_i=1;}
		my $start_k = $mk -$maxLen;
		if ($start_k < 1) {$start_k=1;}
		for(my $i = $start_i; $i <= $mj; $i++) {
			for(my $k = $start_k; $k <= $ml; $k++) {
				if (!defined $pos_i[$i][$k] || $pos_i[$i][$k][0]  < $mi) {next;}
				if ($pos_i[$i][$k][1] >= $mk) {
					# The alignment is "removed" from the scoreCord list by setting
					# it score to a very low value.
					$${$cords[$i][$k]}[0]= $big_neg;
					# Save a little time
					undef($pos_i[$i][$k]);
				}
			}
		}
		#########################
		# Time for the next hit #
		#########################
		$bestcount++;

		$max = &getNextScore(\@scoreCords, \$mi, \$mk, \$hitNum);

	}

	#######################################################################
	# Remove any significant alignments from the estimation of parameters #
	#######################################################################

	my $oldLambda=1000;
	my $oldK = 0;
	my $lambda = $oldLambda;
	my $k=0;

	my $lev = $level[0];

	unless ($clamset) {
		&bootStrap($cut, \@hits, $level[$lev], \$islandCount{$lev}, \$islandAve{$lev}, $length1, $length2, \$lambda, \$k);

		if ($lambda == 10000) {print STDERR  "The significans value is invalid\n"; $lambda=1; $k=0;}
		else {
			for(my $i=1; $i<=$#level; $i++) {

				$lev = $level[$i];

				$oldLambda = $lambda;
				$oldK = $k;
	
				&bootStrap($cut, \@hits, $level[$i], \$islandCount{$level[$i]}, \$islandAve{$level[$i]}, $length1, $length2, \$lambda, \$k);

				if ($lambda > $oldLambda or $islandCount{$level[$i]} < $min_rand) {
					$lambda = $oldLambda;
					$k = $oldK;
					$lev = $level[$i-1]; 
					last;
				}
			}
			
		}
	}
	else {
		$lambda = $clam;
		$k = $ck;
	}

	######################
	# Output the results #
	######################

	my $name_I = $hits[0][0];
	my $name_K = $hits[0][1];
	print "# lambda = $lambda K = $k C = $lev ";
	print "Island count = $islandCount{$lev} Island sum = $islandAve{$lev} Island sum^2 = $islandScore2{$lev} ";
	print "Significance cut = $cut Length $name_I = $length1 Length $name_K = $length2 ID: $id\n";
	foreach my $lin (@hits) {
		my ($name1, $name2, $mi, $mj, $mk, $ml, $max, $maxLen, $bestcount) = @{$lin};
		my $sig = &pval($max, $lambda, $k, $length1, $length2);
		my $z = &zscore($max, $islandAve{$lev}, $islandScore2{$lev}, $islandCount{$lev});
		printf "$name1 %5i %5i $name2 %5i %5i %5i %1.2f %1.3f %3i", $mi, $mj, $mk, $ml, $max, $z, $sig, $bestcount;
		if ($sig < $cut) {print " *";}
		print "\n";
	}
}
########## The program ends here. Only subroutines below ######################

sub zscore {
	my ($score, $sum, $sum2, $count) = @_;

	my $ave = $sum/$count;
	my $spr = sqrt($sum2/$count - $ave**2);
	if ($spr == 0) {
		if ($score > $ave) {return "inf";}
		elsif ($score < $ave) {return "-inf";}
		else {return 0;}
	}
	
	return ($score - $ave)/$spr;
}

sub bootStrap {

	# Using lambda and k remove all significant alignments, and reestimate
	# lambda and k remove all ....
	
	my ($cut, $hits, $lev, $count, $sum, $len1, $len2, $lambda, $k) = @_;
	
	# No counts no lambda
	if ($$count == 0) {$$lambda = 10000; return;}

	$$lambda = &estimateLambda($$sum, $lev, $$count);
	
	if ($$lambda == 10000) {return;}
	
	$$k = &estimateK($$count, $lev, $$lambda, $len1, $len2);
	
	# Start at the top of the list
	for(my $i = 0; $i <= $#{$hits}; ) {
	
		# Remove scores. New_i is how far down the hit list the algorithm has
		# reached.
		my $new_i = &bootCore($i, $cut, $hits, $lev, $count, $sum, $len1, $len2, $lambda, $k);

		# No new significant alignments -> lambda and k are correct it is all over.
		if ($new_i == $i) {last;}
		
		# No counts no lambda
		if ($$count == 0) {$$lambda = 10000; return;}

		# Get the new lambda and k.
		$$lambda = &estimateLambda($$sum, $lev, $$count);
	
		if ($$lambda == 10000) {return;}
	
		$$k = &estimateK($$count, $lev, $$lambda, $len1, $len2);
		
		$i = $new_i;
		
	}
}

sub bootCore {

	my ($start, $cut, $hits, $lev, $count, $sum, $len1, $len2, $lambda, $k) = @_;
	
	my $i = $start;
	for( ; $i <= $#{$hits}; $i++) {

		my $p = pval($$hits[$i][6], $$lambda, $$k, $len1, $len2);
		
		if ($p < $cut) {$$sum-=$$hits[$i][6]; $$count--;}
		else {last;}	

	}
	
	return $i;

}
	
sub estimateLambda {

	# $sum is the sum of all alignments scoring above $cut
	# $cut is the score cut off, any score below this is not included
	# $number is the number of alignments above cut off

	my ($sum, $cut, $number) = @_;

	my $mel = $sum/$number - $cut;
	
	if ($mel != 0) {
		return log(1 + 1/$mel);
	}
	else {
		print STDERR "The lambda value could not be estimated\n";
		return 10000;
	}
}

sub estimateK {

	# $sum is the sum of all alignments scoring above the cut off $cut
	# $cut is the score cut off (minimum score
	# $lambda is the extreme-value distribution lambda parameter
	# $len1 and $len2 are the sequence lengths
	
	my ($number, $cut, $lambda, $len1, $len2) = @_;

	if ($lambda == 10000) {return 10000;}
	return (($number*exp($lambda*$cut)/($len1*$len2)));
}

sub pval {

	my ($score, $lambda, $k, $len1, $len2) = @_;
	
	if ($lambda == 10000 or $k == 10000) {return 1;}
	
	return (1 - exp(-1*$k*$len1*$len2*exp(-1*$lambda*$score)));
}

sub storeAve {
	my  ($v, $ave, $score2, $count, $levels) = @_;
	foreach my $level (@{$levels}) {
		if ($v > $level) {
			$$ave{$level}+=$v;
			$$score2{$level}+=$v*$v;
			$$count{$level}++;
		}
		else {last;}
	}
}

############################################
# Skip the head of the output file         #
# but get the names, the id and the lambda #
############################################
sub readhead {
	my ($lambda, $name1, $name2, $id, $length1, $length2) = @_;
	$$name1=$$name2=$$id="";
	$$lambda=0;
	while (<>) {
		if (/^; ALIGNING            (\S.*) against (.*)/) {$$name1=$1; $$name2=$2;}
		if (/^; ALIGNMENT_ID        (\S.*)/) {$$id=$1;}
		if (/^; LENGTH_SEQUENCE_1   (\d+)/) {$$length1=$1;}
		if (/^; LENGTH_SEQUENCE_2   (\d+)/) {$$length2=$1;}
		if (/^; PARAMETER           max_length=(\d+)/) {$$lambda=$1;}
		if (/^; --------/) {last;}
	}
}

sub skipalignment {
	if (eof) {return;}
	while (<>) {
		if (/; \*\*\*\*\*/) {last;}
	}
	if (eof) {return;}
	while (<>) {
		if (/; \*\*\*\*\*/) {last;}
	}
}

sub getNextScore {

	my ($data, $mi, $mk, $pos) = @_;
	
	# Go through the data array and return the first score better than big_neg
	my $score = $big_neg;
	while ($score <= $big_neg and $#$data > -1) {
		my $ref = shift(@$data);
		($score, $$mi, $$mk) = @{ $ref };
	}
	if ($score <= $big_neg) {
		$$mi = -1;
		$$mk = -1;
	}
	return $score;
}

##########################################################
# Sort numerically after the first element in array ref. #
##########################################################
sub scoreNum { $$b[0] <=> $$a[0]; }


##############################
# Print a help text and exit #
##############################
sub help {
	my ($level) = @_;

	print "Usage:\n";
	print "locateHits [-p=<pval>] [-noboot] [-l=<Lambda> -k=<Kappa>] <FOLDALIGN file>\n";
	print "\n";
	print "This script makes a ranked list of all non-overlapping local alignments with a\n";
	print "score above $level. The significance is calculated for each alignment using\n";
	print "Z-score and P-value (based on the extreme value distribution). The parameters of\n";
	print "the Z-score distribution is estimated directly from the scores of all the\n";
	print "non-overlapping alignments. The parameters of the extreme value distribution are\n";
	print "either given as an option to the program or estimated from the scores of the\n";
	print "non-overlapping alignments using an interative scheme where alignments found to\n";
	print "be significant are removed from the estimate.\n";
	print "\n";
	print "The input FOLDALIGN file must have been produced using options -plot_score or\n";
	print "-no_backtrack\n";
	print "\n";
	print "The output format is first a status line starting with '#'. Then a line for each\n";
	print "of the non overlapping alignments. The line format is: Name_1 start_1 end_1\n";
	print "name_2 start_2 end_2 score Z-score p-val rank [*]\n";
	print "\n";
	print "Options:\n";
	print "-p=<pval> A significance cut off any alignment with a p-value below this cut off\n";
	print "   is considered to be a significant alignment and is not used to estimate the\n";
	print "   parameters of the extreme value distribution. These alignments are marked\n";
	print "   with a * after the rank\n";
	print "-noboot Do not use the iterative scheme to estimate the extreme value\n";
	print "   parameters.\n";
	print "-l=<Lambda> Sets the Lambda value of the extreme value distribution. If this\n";
	print "   option is used the option -k must also be used.\n";
	print "-k=<Kappa> Sets the Kappa value of the extreme value distribution. If this\n";
	print "   option is used the option -l must also be used.\n";


	exit;
}
