#!/usr/bin/perl -w


package prepare_stockholm_file;



use strict;
use warnings;
use Data::Dumper;




sub prepareStockholm {

	my ($singleLineAliHashRef, $gapSeq, $g11, $g12, $g21, $g22, $seqNum, $pairedGappedHashRef) = @_;
	
	####################
	## print information
	print STDOUT "\#\# Prepairing Stockholm file\n";
	
	my ($baseHashRef) = countBases($singleLineAliHashRef, $g11, $g12, $g21, $g22);
		
	## remove columns with less than 10% bases
	my $modifiedAlignmentHashRef = removeColumns($singleLineAliHashRef, $seqNum, $gapSeq, $baseHashRef, $pairedGappedHashRef);
	
	my $redHashRef = removeInvalidLines($modifiedAlignmentHashRef);
	return($redHashRef);
}

 

## removes lines from alignment with invalid characters that can't be handled by RMdetect
## does not remove full sequence or #=gc lines
sub removeInvalidLines {
	
	my ($modifiedAlignmentHashRef) = @_;
	my %redHash;
	my @gcSeqID = grep { $_ =~ /^\#/ } keys %$modifiedAlignmentHashRef;
	my @fullSeqID = grep { $_ =~ /^full/ } keys %$modifiedAlignmentHashRef;
	foreach my $seqID(keys %$modifiedAlignmentHashRef){
		if(($seqID !~ /^\#/) && ($seqID !~ /^full/)){
			my $seq = $$modifiedAlignmentHashRef{$seqID};
			my @arr = split(//, $seq);
			my $flag = "FALSE";
			foreach my $let(@arr){
				if($let !~ /[ACGUacgu\.\-\_\:\,\~\s]+/g){ $flag = "TRUE"; }
			}
			if($flag eq "FALSE"){
				$redHash{$seqID} = $seq;
			}
		}
	}
	foreach my $gc(@gcSeqID){ $redHash{$gc} = $$modifiedAlignmentHashRef{$gc}; }
	foreach my $fu(@fullSeqID){ $redHash{$fu} = $$modifiedAlignmentHashRef{$fu}; }
	return(\%redHash);
}






sub removeColumns {

	my ($singleLineAliHashRef, $seqNum, $gapSeq, $baseHashRef, $pairedGappedHashRef) = @_;
	my %newHash;
	
	## calculate percentage of bases in column, if perc < 10 delete column
	foreach my $orgi(keys %$singleLineAliHashRef){
		
		my $seq = $$singleLineAliHashRef{$orgi};
		my @seqArr = split(//, $seq);
		my $newSeq = "";
		foreach my $pos(sort {$b<=>$a} keys %$baseHashRef){
			my $perc = ($$baseHashRef{$pos} / $seqNum) * 100;
			if($perc < 10){
				if(! exists($$pairedGappedHashRef{$pos})){
					splice(@seqArr, ($pos - 1), 1);
				}
			}
		}
		for(my $l = 0; $l < scalar @seqArr; $l++){
				$newSeq .= $seqArr[$l];
		}
		$newHash{$orgi} = $newSeq;
	}
	return(\%newHash);
}





sub countBases {

	my ($singleLineAliHashRef, $g11, $g12, $g21, $g22) = @_;
	my %baseHash;
	
	## init hash for 2 motif regions each with '0'
	for(my $x = $g11; $x <= $g12; $x++){ $baseHash{$x} = 0; }
	for(my $z = $g21; $z <= $g22; $z++){ $baseHash{$z} = 0; }
		
	## count bases of each column in both motif regions
	foreach my $org(keys %$singleLineAliHashRef){
		if($org !~ /^\#/){
			
			my $tmpg11 = $g11;
			my $tmpg21 = $g21;
			my $seq = $$singleLineAliHashRef{$org};

			## motif 1
			my $mo1Reg = uc(substr($seq, ($g11-1), ($g12-$g11+1)));
			my @mo1Arr = split(//, $mo1Reg);
			for(my $y = 0; $y < scalar @mo1Arr; $y++){
				
				if($mo1Arr[$y] =~ /[A-Za-z]{1}/){
					$baseHash{$tmpg11} += 1;
				}
				$tmpg11++;
			}
			## motif 2
			my $mo2Reg = uc(substr($seq, ($g21-1), ($g22-$g21+1)));
			my @mo2Arr = split(//, $mo2Reg);
			for(my $yy = 0; $yy < scalar @mo2Arr; $yy++){
				
				if($mo2Arr[$yy] =~ /[A-Za-z]{1}/){
					$baseHash{$tmpg21} += 1;
				}
				$tmpg21++;
			}
		}
	}
	return(\%baseHash);
}


1;

