#!/usr/bin/perl -w

package prepareModelBuilding;

use strict;
use warnings;
use Data::Dumper;





my $refSeq = "";
my $preparedFlag = "FALSE";
my $consStr = "\#=GC SS_cons";
my $rfStr = "\#=GC RF";
my $refStr = "REF_SEQ";
my $stockHead = "\# STOCKHOLM 1.0\n";
my %pairHash;




sub prepareFiles4Modelling {

	my ($fam, $newOutDir, $tmpStoFi, $newAliFi, $g11, $g12, $g21, $g22, $gapSeq, $newDefFi, $modelName, $version, $bps) = @_;
	
	$refSeq = "";	
	if(-e $tmpStoFi){
		
		$refSeq = getRefSeq($g11, $g12, $g21, $g22, $gapSeq);
		if($refSeq ne ""){
			formatTmpAli($fam, $refSeq, $tmpStoFi, $newAliFi);
			createDefinitionFile($fam, $bps, $gapSeq, $newDefFi, $modelName, $version, $newAliFi, $g11, $g12, $g21, $g22);
			
		}
	}
}









sub createDefinitionFile {

	my ($fam, $bps, $gapSeq, $newDefFi, $modelName, $version, $newAliFi, $g11, $g12, $g21, $g22) = @_;
	my $refSeq1 = substr($gapSeq, $g11-1, ($g12-$g11+1));
	my $refSeq2 = substr($gapSeq, $g21-1, ($g22-$g21+1));
	$refSeq1 =~ s/\-/\./g;
	$refSeq1 =~ s/\-/\./g;
	$refSeq1 = uc($refSeq1);
	$refSeq2 = uc($refSeq2);
	my $refSeqs = $refSeq1.$refSeq2;
	
	my $len1 = length($refSeq1);
	my $len2 = length($refSeq2);
	my $pattern1 = "";
	my $pattern2 = "";
	while(length($pattern1) < $len1){ $pattern1 .= "*"; }
	while(length($pattern2) < $len2){ $pattern2 .= "*"; }
	my $defStr = "";
	$defStr .= "NAME\t$modelName\t$version\n\n";
	$defStr .= "REF_SEQS\t$refSeq1\t$refSeq2\n\n";
	$defStr .= "DATA_SOURCE\t$pattern1\t$pattern2\t$newAliFi\n\n";
	
	my $nodeHash = getNodes($fam, $bps, $gapSeq, $g11, $g12, $g21, $g22, $len1, $len2);
	foreach my $node(sort { $a <=> $b } keys %$nodeHash){
		my $chNoBpStr = $$nodeHash{$node};
		my ($ch, $no, $bp) = split(/\*/, $chNoBpStr);
		$defStr .= "NODE\t".$node."\t$ch\t$no\t$bp\n";
	}
	$defStr .= "\n\n\n\n\n\n";
	my $mustHash = getMustPairStrings($nodeHash, $refSeqs);
	
	if(scalar(keys %$mustHash) > 0){
		 
		foreach my $p(sort keys %$mustHash){
			my $pList = $$mustHash{$p};
			foreach my $pai(@$pList){
				$defStr .= "PAIRS\t$p\t$pai\tMUST\n";
			}
		}
	}
	#my $canHash = getCanPairStrings($nodeHash);
	#if(scalar(keys %$canHash) > 0){
	#	foreach my $pp(sort keys %$canHash){
	#		my ($firstBas, $secBas) = split(/\_/, $pp);		
	#		$defStr .= "PAIRS\t$firstBas\t$secBas\tCAN\n";
	#	}
	#}
	$defStr .= "\n";
	my $noHash = getNoPairingString($len1+$len2);
	if(scalar(keys %$noHash) > 0){
		$defStr .= "NO_PAIRING";
		foreach my $np(sort { $a <=> $b } keys %$noHash){
			$defStr .= "\t$np";
		}
	}
	#$defStr .= "\n\n";
	#$defStr .= "ORDER";
	#my $sortList = getTopologicalOrder($nodeHash);
	#foreach my $sortItem(@$sortList){
	#	$defStr .= "\t$sortItem";
	#}
	$defStr .= "\n\n";
	$defStr .= "SEPARATION      4       0\n\n";
	writeDefFile($defStr, $newDefFi);
}







sub getCanPairStrings {

	my ($nodeHash) = @_;
	my $canStr = "";
	my %tmpHash;
	
	## get CAN_PAIRING string
	foreach my $node(sort { $a <=> $b } keys %$nodeHash){
		if(!exists($pairHash{$node})){
			my $str = $$nodeHash{$node};
			my ($c, $n, $b) = split(/\*/, $str);
			if($b ne "-"){
				if($b =~ /\,/g){
					my @pairings = split(/\,/, $b);
					foreach my $pair(reverse(@pairings)){
						$tmpHash{$pair."_".$node} = 0;
						$pairHash{$node} = 0;
						$pairHash{$pair} = 0;
					}
				}
				else{
					$tmpHash{$b."_".$node} = 0;
					$pairHash{$node} = 0;
					$pairHash{$b} = 0;
				}
			}
		}
	}
	return(\%tmpHash);
}





sub getNoPairingString {

	my $motifLen = shift;
	my %noHash;
	for(my $d = 0; $d < $motifLen; $d++){
		if(!exists($pairHash{$d})){
			$noHash{$d} = 0;
		}
	}
	return(\%noHash);
}





sub getMustPairStrings {

	my ($nodeHash, $refSeqs) = @_;
	## get first and last elements of the stem that have(!) to pair (cWW)
	my @sortArr = sort { $a <=> $b } keys %$nodeHash;
	my @bpArr;
	my %bpHash;
	
	foreach my $ndx(@sortArr){

		my $str = $$nodeHash{$ndx};
		my ($c, $n, $bp) = split(/\*/, $str);
			
		if($bp ne "-"){
			my @pairings = split(/\,/, $bp);
			foreach my $pair(@pairings){
				$pairHash{$pair} = 0;
				$pairHash{$ndx} = 0;
				if(exists($bpHash{$pair})){
					my $oldArr = $bpHash{$pair};
					push(@$oldArr, $ndx);
					$bpHash{$pair} = $oldArr;
				}
				else{
					my @tmpArr = ($ndx);
					$bpHash{$pair} = \@tmpArr;
				}
			}
		}
	}
	return(\%bpHash);
}




sub getNodes {

	my ($fam, $bps, $gapSeq, $gsta1, $gsto1, $gsta2, $gsto2, $len1, $len2) = @_;
	my %nodeHash;
	my $nodeCount = 0;
	my $chainId = 0;
	my $ntCount = 0;
	my $nodeString = "";
	my $new = "-";
	my $conds = "";
	my $fl = "T";
	my $gapMo1 = uc(substr($gapSeq, $gsta1-1, $len1));
	my $gapMo2 = uc(substr($gapSeq, $gsta2-1, $len2));
	my $mots = $gapMo1.$gapMo2;
	## get node string for each position of first motif (gapped motif sequence from gapped alignment!)
	for(my $q = 0; $q < length($mots); $q++){
		my $currentBase = substr($mots, $q, 1);
		my $substringUptoCurrentBase = substr($mots, 0, ($q+1));
		my $dotsInSubstring = $substringUptoCurrentBase;
		if($dotsInSubstring =~ /\w/g){ $dotsInSubstring =~ s/\w+//g; }
		my $oldNdxCurrentBase = $q - length($dotsInSubstring);
		## reset variables
		if($q == length($gapMo1)){ $chainId = 1; $ntCount = 0; }
		## for motif 1 are no dst available, that's why we can set new="-"
		if($q < length($gapMo1)){
			$nodeHash{$q} = $chainId."*".$ntCount."*".$new;
		}
		else{
			if($currentBase =~ /[a-zA-Z]+/){
				## search in bps array from mapping output for targets, a.k.a conds or dst
				my @pairArr = split(/\+/, $bps);
				foreach my $pairs(@pairArr){
					my ($src, $dst) = split(/\*/, $pairs);
					my $nsrc = $src;
					if($dst == $oldNdxCurrentBase){
						for(my $r=0; $r <=$src; $r++){
							my $cuBa = substr($mots,$r,1);
							if($cuBa !~ /\w/gi){
								$nsrc = $nsrc + 1;
							}
						}
						if(exists($nodeHash{$q})){
							$nodeHash{$q} .= ",".$nsrc;
						}
						else{
							$nodeHash{$q} = $chainId."*".$ntCount."*".$nsrc;
						}
					}
				}
				if(!exists($nodeHash{$q})){ $nodeHash{$q} = $chainId."*".$ntCount."*".$new; }
			}
			else{ $nodeHash{$q} = $chainId."*".$ntCount."*".$new; }
		}
		$ntCount++;
	}
	return(\%nodeHash);
}





sub formatTmpAli {

	## re-format the alignment file
	## get a single line alignment for each sequence called .stk in the same order as the input alignment
	## insert a reference sequence with the motif sequence
	my ($fa, $refSeq, $tmpStoFi, $newAliFi) = @_;
	
	my %hash;
	my @arr = ();
	my $id = "";
	my $seq = "";
	my $rest = "";
	my $seqLen = 0;
	my $maxLenId = 0;
	
	
	## fetch the cleaned modified alignment sequences and ids, in same order as the original ali 
	if(-e $tmpStoFi){
		open(ALI, "<$tmpStoFi") or die "\nCan't open $tmpStoFi\n\n";
		while(<ALI>){
			next unless($_ !~ /^$/);
			chomp($_);
			## read alignment, fetch sequences
			if($_ !~ /^\#/){
				if($_ =~ /[\s\t]+/){
					($id, $seq) = split(/[\s\t]+/, $_);
					if(exists($hash{$id})){
						$hash{$id} .= $seq;
					}
					else{
						$hash{$id} = $seq;
						push(@arr, $id);
						if(length($id) > $maxLenId){
							$maxLenId = length($id);
						}
					}
				}
			}
			elsif($_ =~ /^\#=GC SS\_cons/){
				($id, $rest, $seq) = split(/[\s\t]+/, $_);
				if(exists($hash{$consStr})){
					$hash{$consStr} .= $seq;
				}
				else{
					$hash{$consStr} = $seq;
				}
			}
			elsif($_ =~ /^\#=GC RF/){
				($id, $rest, $seq) = split(/[\s\t]+/, $_);
				if(exists($hash{$consStr})){
					$hash{$rfStr} .= $seq;
				}
				else{
					$hash{$rfStr} = $seq;
				}
			}
		}
		close(ALI);
		
		## open new alignmnet file
		## print STOCKHOLM header
		open(NAL, ">$newAliFi") or die "\nCan't open $newAliFi\n\n";
		print NAL $stockHead;
		
		## extend ids to get same length for all of them
		## print id and seq in one line
		foreach my $seqId(@arr){
			my $tmpSeq = $hash{$seqId};
			while(length($seqId) < ($maxLenId+2)){
				$seqId .= " ";
			}
			print NAL $seqId.$tmpSeq."\n";
		}
		## extend length of ref string
		## print ref string and reference sequence in one line
		while(length($refStr) < ($maxLenId+2)){ $refStr .= " "; }
		$refSeq = uc($refSeq);
		print NAL $refStr.$refSeq."\n";

		## extend length of consensus string
		## print consensus secondary structure of alignment and consensus string in one line
		my $consSeq = $hash{$consStr};
		my $rfSeq = $hash{$rfStr};
		while(length($consStr) < ($maxLenId+2)){ $consStr .= " "; }
		print NAL $consStr.$consSeq."\n";
		
		## extend length of consensus sequence string
		## print consensus sequence string and consensus sequence in one line
		while(length($rfStr) < ($maxLenId+2)){ $rfStr .= " "; }
		print NAL $rfStr.$rfSeq."\n";
		print NAL "//\n";
		close(NAL);
	}
}




sub getRefSeq {

	my ($g11, $g12, $g21, $g22, $gapSeq) = @_;
	my %hash;
	my $seqLen = length($gapSeq);
	my $refS = "";
	my $len1 = $g12 - $g11 + 1;
	my $len2 = $g22 - $g21 + 1;
	my $mo1 = substr($gapSeq, ($g11-1), $len1);
	my $mo2 = substr($gapSeq, ($g21-1), $len2);
	$mo1 =~ s/\-/\./g;
	$mo2 =~ s/\-/\./g;
	while(length($refS) < ($g11-1)){ $refS .= "."; }
	$refS .= uc($mo1);
	for(my $i = 1; $i < ($g21-$g12); $i++){ $refS .= "."; };
	$refS .= uc($mo2);
	while(length($refS) < $seqLen){ $refS .= "."; };
	return($refS);
}



sub writeDefFile {

	my ($defStr, $newDefFi) = @_;
	open(DEF, ">$newDefFi") or die "\nCan't open $newDefFi\n\n";
	print DEF $defStr;
	close(DEF);
}

1;
