#!/usr/bin/perl -w

use strict;

# This script was written by Jakob Hull Havgaard, 2004
# Coloring was inactivated by Ebbe Sloth Andersen, February 2005.
# Pairing mask was changed to include "-" in L295, ESA, August 2005.

# The certainty is reported as -1 if no value is given. The minimum color is used
# The certainty is reported as -2 if it has the value nan The minimum color is used



#########################################################
# Setup of default variables

my $open="(";       # The base-pair open character
my $close=")";      # The base-pair close character
my $single=".";     # The non-pair character

my %data;           # Holds structure. Positions that are base-pair holds the
                    #  base-pair position. Non-base-paired positions holds -1
						  #  Positions starts at 1

my $prog_name = "addpmasknum6"; # The script name

# Define the colors. Format: Red value, green, blue.
my %colors = (
	"red"       => [1.00, 0.00, 0.00],
	"yellow"    => [1.00, 1.00, 0.00],
	"green"     => [0.00, 1.00, 0.00],
	"purple"    => [0.93, 0.51, 0.93],
	"blue"      => [0.00, 0.00, 1.00]
);

# Define when to use which color
# Format: cutoff, color. Colors are defined in the color hash
# The reliability has to be equal or greater than cutoff before the color is
# used.
# The first cutoff has to be 0.
my @color_scale = (
	[0.00, "red"],
	[0.01, "yellow"],
	[0.25, "green"],
	[0.50, "purple"],
	[0.75, "blue"]
);

# The allowed base-pairs
my %bp = (
	"au" => 1,
	"cg" => 1,
	"gc" => 1,
	"gu" => 1,
	"ua" => 1,
	"ug" => 1
);

# The printf formats used for each type of field
my %formats = (
	"label" => "%1s",
	"residue" => " %1s",
	"seqpos" => " %5s",
	"alignpos" => " %5d",
	"align_bp" => " %5s",
	"color_r"  => " %1.2f",
	"color_g"  => " %1.2f",
	"color_b"  => " %1.2f",
	"certainty" => " %1.4f",
	"UNKNOWN" => " %5s"
);

#######################################################
# Process the first part of the file

my @header;   # The entry header
my @body;     # The entry body
my @end;      # The entry post body lines

# Read the first line and skip until a TYPE RNA line is found
my $first_line;
while ((defined ($first_line = <>)) and ($first_line !~ /^; TYPE\s+RNA/)) {
	if ($first_line =~ /; TYPE\s+pairingmask/) {
		# Do not print old pairingmasks
		&dumpentry();
		next;
	}
	print $first_line;
}

# If no TYPE RNA was found exit the script
if (!defined $first_line) {exit;}

# Read and process the first RNA entry
&readEntry(\@header, \@body, \@end, $first_line);

my %head_fields;  # The fields of the header
my $formatLine = &parseHeader(\@header, \%head_fields, \%formats);

# Make the parring mask
&makeParingMask(\%head_fields, \@body, \%data, $open, $close, $single);

# Process the first entry
&printEntry(\@header);
&processEntry(\@body, \%head_fields, \%bp, $formatLine);
&printEntry(\@end);


######################################################
# Process the remaining entries

while (<>) {
	# Check the type of the entry
	my @fields = split /\s+/;
	&readEntry(\@header, \@body, \@end, $_);
	
	if ($fields[2] eq "RNA") {
		$formatLine = &parseHeader(\@header, \%head_fields, \%formats);
		&printEntry(\@header);
		&processEntry(\@body, \%head_fields, \%bp, $formatLine);
		&printEntry(\@end);
	}
	else {
		&skipAllEntry(\@header, \@body, \@end);
	}
}


################# Sub-routines #######################


########################
# Read a col file entry
# The first line is presumed to have been read earlier and is supplied as an 
# argument.
# The entry is split into three parts
# The header: return through the first argument, which must be an array reference
# The body (line): return through the seconed argument (array ref)
# The end: Return through the third argument (array ref)
sub readEntry {
	my ($head, $line, $end, $fl) = @_;   # The first three are references to arrays
	my ($hc, $lc, $ec) = (0, 0, 0); # Line counters

	# Read header
	
	# Store the first line of the header
	@$head = ();
	$$head[$hc] = $fl;
	$hc++;
	while (<>) {
		if (/^;/) {   # This is a header line
			$$head[$hc] = $_;
			$hc++;
		}
		else {last;}  # The header is done
	}
	
	# Read the sequences lines
	
	# Check and store the first line
	if (/^;/) {print STDERR "$prog_name: Warning! There is something wrong with the input file\n"; return -1;}
	@$line = ();
	$$line[$lc] = $_;
	$lc++;
	while (<>) {
		if (/^;/) {last;}  # End of the sequences lines
		$$line[$lc] = $_;
		$lc++;
	}
	
	# Read the post entry lines if any
	
	# Check and store the first line
	@$end = ();
	if (/^; \*\*\*\*\*/) { # This is what is expected
		$$end[$ec] = $_;
		$ec++;
		return 1;
	}
	elsif (!/^;/) { # Something might be missing
		print STDERR "$prog_name: Warning! The input file has an unexpected end. Something might be wrong.\n";
		return 0;
	}
	else {  # Read any extra post entry lines
		while (<>) {
			$$end[$ec] = $_;
			$ec++;
			if (/^; \*\*\*\*\*/) {return 1;}
		}
	}
	print STDERR "$prog_name: Warning! The input file has an unexpected end. Something might be wrong.\n";
	return 0;
}


#######################################
# Parse a col file header
sub parseHeader {
	my ($in, $out, $format) = @_;  
	# The first argument is an array with the header lines	
	# The second argument is a hash in which the information is returned
	# Column labels are stored with label -> number all other types of fields
	# are stored as "field_1 -> field_2"
	# Format is a hash which holds the printf formatting strings for all columns
	# The function returns a printf format for the output of the body lines
	
	%$out = (); # Delete the old hash
	my $out_format;
	for(my $i=0; $i<=$#$in; $i++) {
		my @field = split /\s+/, $$in[$i];
		if ($field[1] eq "COL") {
			$$out{$field[3]} = $field[2];
			if (defined $$format{$field[3]}) {$out_format .= $$format{$field[3]};}
			else {$out_format .= $$format{"UNKNOWN"};}
		}
		else {$$out{$field[1]} = $field[2];}
	}
	return "$out_format";
}

#######################################
# Parses an entry into a pairing mask and prints the pairing mask
# Open, Close and single are the characters used to define the structure

sub makeParingMask {
	my ($head_fields, $lines, $data, $open, $close, $single ) = @_;
	# Head_fields is the hash holding the header fields
	# Lines holds the lines of the entry
	# Data is the parsed structure (hash)
	
	# Get the column numbers for the structure
	my $pos = $$head_fields{"alignpos"};
	my $bpp = $$head_fields{"align_bp"};
	my $cer;
	# Get the column number for the certainty or reliability
	# cer == -1 is used to indicate that no field was found
	if (defined $$head_fields{"certainty"}) {$cer = $$head_fields{"certainty"};}
	elsif (defined $$head_fields{"reliability"}) {$cer = $$head_fields{"reliability"};}
	else {print STDERR "$prog_name: Warning no certainty or reliability field found the first RNA type entry. Colorcoding set to zero\n"; $cer = -1;}

	# Zero based index adjustment
	$pos--;
	$bpp--;
	$cer--;
	
	# The annotation string
	my $anno = "";
	
	# Array for storing the certainties and the colors
	my @certain;
	my @color;
	# Parse the annotation
	for(my $i = 0; $i<=$#$lines; $i++) {
		my @fields = split(/\s+/, $$lines[$i]);
		if ($fields[$bpp] eq $single) {$anno .= $single;}
		elsif ($fields[$pos] < $fields[$bpp]) {$anno .= $open;}
		elsif ($fields[$pos] > $fields[$bpp]) {$anno .= $close;}
		else {
			print STDERR "$prog_name: Warning unknown annotation found in input file. Annotating as single stranded\n";
			$anno .= $single;
		}
		# Assign color fields
		if ($cer == -2) {
			$certain[$i] = -1;
			$color[$i] = $color_scale[0][1];
		}
		else {
			$certain[$i] = $fields[$cer];
			if ($certain[$i] eq "nan") {
				$color[$i] = $color_scale[0][1];
				$certain[$i] = -2;
			}
			else {
				foreach my $cs (@color_scale) {
					if ($$cs[0] <= $certain[$i]) {$color[$i] = $$cs[1];}
					else {last;}
				}
			}
		}
	}
	
	# Parse the structure and store the new annotation
	my $num_anno = &parsestruc($anno, $data, $open, $close, $single);

	# Output the parringmask
	print "; TYPE              pairingmask\n";
	print "; COL 1             label\n";
	print "; COL 2             residue\n";
	print "; COL 3             alignpos\n";
 	print "; COL 4             certainty\n";
	print "; ENTRY             pairingmask\n";
	print "; ----------\n";
	for(my $i=0; $i<length($num_anno); $i++) {
		my $p = $i +1;
		print "M   ";
                if ( substr($num_anno, $i, 1) eq "." ) {
                    print "-";
                } else {
                    print substr($num_anno, $i, 1);
                }
#		printf " $formats{'color_r'}", $colors{$color[$i]}[0];
#		printf " $formats{'color_g'}", $colors{$color[$i]}[1];
#		printf " $formats{'color_b'}", $colors{$color[$i]}[2];
		print "    $p";
		printf " $formats{'certainty'}", $certain[$i];
		print "\n";
	}
   print "; **********\n";
}


############################################
# Prints an entry
# The nucleotides of allowed base-pairs are printed as capital letters
# all other nucleotides are printed as lower case letters
sub processEntry {
	my ($line, $head_fields, $bp, $format) = @_;
	# Line is the entry lines
	# Head_fields holds the header fields
	# Bp holds the allowed base-pairs
	# Format holds the printf format of the output
	
	# Get the column numbers
	my $pos = $$head_fields{"alignpos"};
	my $bpp = $$head_fields{"align_bp"};
	my $nuc = $$head_fields{"residue"};
	
	# Zero based index adjustment
	$pos--;
	$bpp--;
	$nuc--;
	
	# Go through the sequence
	for(my $i=0; $i<=$#$line; $i++) {
		my @field = split(/\s+/, $$line[$i]);
		$field[$nuc] = lc $field[$nuc];
		if ($field[$bpp] ne $single) {
			my $line_pos = $field[$bpp];  # Store the line number
			$line_pos--; # adjust for zero base index
			my @sec_field = split (/\s+/, $$line[$line_pos]);
			my $let = lc $sec_field[$nuc];
			if (defined $$bp{"$field[$nuc]$let"}) {
				$field[$nuc] = uc $field[$nuc];
			}
		}
		printf "$format\n", @field;
	}
}

##########################################
# Prints three arrays
sub skipAllEntry {
	my ($h, $b, $e) = @_;
	&printEntry($h);
	&printEntry($b);
	&printEntry($e);
}
sub dumpentry {
	while (<>) {
		if (/^; \*\*\*\*\*/) {last;}
	}
}
#####################################
# Print an array
sub printEntry {
	my ($lines) = @_;
	foreach my $line (@$lines) {
		print $line;
	}
}

#######################
# Parse the structure #
#######################

sub parsestruc {
# There three cases: base-pair open, close, and no base-pair each is handled
# separately

	my ($anno, $data, $open, $close, $singel) = @_; # The annotation and the structure
	my $end = length($anno);
	my $begin = 1;
	my @lager;   # A stack
	my $start; # The left side of a basepair
	my $n;
	my $countbp=0;
	my @stemStack; # Holds the positons where stems starts
	my $stemMax=0;
	my $prevState = 1; # 1 for open, 0 for close
	my $prevPos=0;     # Keep track of the last position
	for(my $i=$begin-1; $i<$end; $i++) {
		$n=$i+1;
		if (substr($anno,$i,1) eq $open) {
# Base-pair open
# Put the position on the lager stack so it will be pop when the right closing
# base-pair is reached.

# If the previous state was a close state then store that positions opening
# position in the stemStack. The new stemMax position is also set.
			if ($prevState == 0) {
				push(@stemStack, $$data{$prevPos});
				$stemMax = $$data{$prevPos};
			}
			push(@lager, $n); # If its the left part of a basepair put it in the stack
			$prevState = 1;
			$prevPos=$n;
		}
		elsif (substr($anno,$i,1) eq $close) {
# Base-pair close
# Get the start position is taken from the top of the lager stack and the base-
# pair is store in the data hash.

# The lager stack is empty then something is wrong
			if ($#lager < 0) {print STDERR "Could not parse stack. Underflow error\n";$start = 0;}
			else {
				$start = pop(@lager); # Get the left side from the stack
			}
# If the start position is less than the previous stacks starts then the current
# base-pair is from another stack
# Store the previous positons opening position as a stem start
# Go through all stem starts to find the next stemMax. The new stem start
# position must be small than the previous.
			if ($start < $stemMax) {
				push(@stemStack, $$data{$prevPos});
				$stemMax=0;
				foreach my $stem (@stemStack) {
					if (($stem < $start) && ($stem > $stemMax)) {$stemMax=$stem;}
				}
			}
			$$data{$start} = $n;  # Put the complementary positions into the structure
			$$data{$n} = $start;
			$countbp++;
			$prevState = 0;
			$prevPos = $n;
		}
		else {
# Non-basepair
			if (substr($anno, $i, 1) eq $singel) {
				$$data{$n} = -1; # No basepairing
			}
			else {
				print STDERR "Something wrong in the strucCC calculation\n";
			}
		}
	}
	if ($#lager != -1) {print STDERR "Could not parse stack. $#lager elements left in the stack\n";}
# Make the annotation line
	push(@stemStack, $$data{$prevPos});
	my $stemCount = 0;
	my $count =0;
	my $stemline = $single x $end;
	@stemStack = sort num @stemStack;
	push(@stemStack, -2);
	$stemMax = $stemStack[0];
	my $let = $stemCount;
	foreach my $pos (sort num keys %{$data}) {
		if ($pos < $$data{$pos}) {
			if ($pos == $stemMax) {
				$stemCount++;
				$count++;
				$let = $stemCount;
				if ($stemCount == 10) {$stemCount = 65;}
				if ($stemCount == 91) {$stemCount = 97;}
				if ($stemCount == 122) {print STDERR "Warning: To many stems lettering may be unreadable\n";}
				if ($stemCount > 10) {$let = chr($stemCount);}
				$stemMax = $stemStack[$count];
			}
			substr($stemline, $pos-1, 1) = $let;
			substr($stemline, $$data{$pos}-1, 1) = $let;
		}
	}
	return $stemline;
}

sub num {$a<=>$b;}
