#!/usr/bin/perl


package clustalW;


use FindBin;
use lib "$FindBin::Bin/metaRNAmodules_mapping/";
use lib "$FindBin::Bin/lib/";
use utils;
use newClass;
use strict;
use warnings;
use Data::Dumper;






## prepareClustalW($strucTabHashRef, $moHashRef, $outDir, $infoHash)
##
## Aligns the FR3D sequence of one motif against the ungapped Rfam sequence
## with CLUSTAL W and, when both motif strands are found in the Rfam sequence
## and match the FR3D motif sequences exactly, calls newClass::goAhead.
##
##   $strucTabHashRef  hash ref; keys read here: "org", "rfamEmblSeqPosStart",
##                     "rfamEmblSeqPosStop", "fullUngapSeq"
##   $moHashRef        hash ref; keys read here: "pdb", "chainInner1",
##                     "motifSeqs" and the "fredSeq_<chain>" entry;
##                     "computedMotifs" is written on success
##                     (parseClustalResult adds the "ungapSeedAliPumo*" keys)
##   $outDir           accepted for interface compatibility, not used here
##   $infoHash         hash ref; keys read here: "instTmpDir", "clustalw"
##
## No meaningful return value.
sub prepareClustalW {

	my ($strucTabHashRef, $moHashRef, $outDir, $infoHash) = @_;
	
	####################
	## print information
	print STDOUT "\#\# Run CLUSTAL W\n";
	
	my $org = $$strucTabHashRef{"org"};
	my $rfamEmblSeqPosStart = $$strucTabHashRef{"rfamEmblSeqPosStart"};
	my $rfamEmblSeqPosStop = $$strucTabHashRef{"rfamEmblSeqPosStop"};
	my $pdb = $$moHashRef{"pdb"};
	my $rfamUngapSeq = $$strucTabHashRef{"fullUngapSeq"};
	my $tmpDir = $$infoHash{"instTmpDir"};
	my $clustalw = $$infoHash{"clustalw"};
	my $chI1 = $$moHashRef{"chainInner1"};
	
	## pick the FR3D sequence of the inner chain: an exact "fredSeq_<chain>" key wins;
	## otherwise fall back to the first matching key in sorted order, so the choice
	## never depends on hash ordering. \Q..\E keeps the chain id literal in the regex.
	my $exactKey = "fredSeq_".$chI1;
	my $fredSeqKey;
	if(exists $$moHashRef{$exactKey}){
		$fredSeqKey = $exactKey;
	}
	else{
		my @fredSeqArr = grep { $_ =~ /fredSeq\_\Q$chI1\E/ } keys %$moHashRef;
		@fredSeqArr = sort @fredSeqArr;
		$fredSeqKey = $fredSeqArr[0];
	}
	my $fredSeq = $$moHashRef{$fredSeqKey};
	
	my $tmpFasta = writeTmpFasta($fredSeq, $rfamUngapSeq, $org, $rfamEmblSeqPosStart, $rfamEmblSeqPosStop, $pdb, $chI1, $tmpDir);
	my $tmpAln = runClustalW($tmpFasta, $clustalw);
	
	## the flags tell whether each motif strand was located in the Rfam sequence;
	## on success parseClustalResult stores the ungapped Rfam start/stop positions in $moHashRef
	my ($hitFlag1, $hitFlag2) = parseClustalResult($tmpAln, $moHashRef);
	
	if(($hitFlag1 eq "T") && ($hitFlag2 eq "T")){

		my ($computedMotif1, $computedMotif2) = getMotifSubstring($rfamUngapSeq,$moHashRef);
		my $pumos = $$moHashRef{"motifSeqs"};
		my ($pumo1, $pumo2) = split(/\&/, $pumos);
		
		## filter for motifs which have a shift in the position (i.e. FR3D motif seq is different from the computed sequence)
		if(($pumo1 eq $computedMotif1) && ($pumo2 eq $computedMotif2)){
			
			my $computedMotifs = $computedMotif1."&".$computedMotif2;
			$$moHashRef{"computedMotifs"} = $computedMotifs;
			
			newClass::goAhead($moHashRef, $strucTabHashRef, $infoHash);
		}
	}
}








## getMotifSubstring($fullUngapSeq, $moHashRef)
##
## Cuts both motif strands out of the ungapped sequence. Start and stop
## positions are 1-based and inclusive and are read from the
## "ungapSeedAliPumo{1,2}Rfam{Start,Stop}" entries of $moHashRef.
## Returns the two strands, upper-cased, as a list (strand 1, strand 2).
sub getMotifSubstring {

	my ($fullUngapSeq, $moHashRef) = @_;
	my ($start1, $stop1, $start2, $stop2) = @{$moHashRef}{
		"ungapSeedAliPumo1RfamStart",
		"ungapSeedAliPumo1RfamStop",
		"ungapSeedAliPumo2RfamStart",
		"ungapSeedAliPumo2RfamStop"
	};
	## 1-based inclusive range -> 0-based offset plus length
	my $strand1 = substr($fullUngapSeq, $start1 - 1, $stop1 - $start1 + 1);
	my $strand2 = substr($fullUngapSeq, $start2 - 1, $stop2 - $start2 + 1);
	return(uc($strand1), uc($strand2));
}








## runClustalW($tmpFasta, $clustalw)
##
## Runs CLUSTAL W on the FASTA file $tmpFasta. $clustalw is the command used
## to start the program; it is passed to the shell as part of one command
## string, so it may carry extra options.
## Returns the path of the alignment file, i.e. $tmpFasta with its trailing
## ".fasta" extension replaced by ".aln". A failing command only produces a
## warning; the path is returned in any case.
sub runClustalW {

	my ($tmpFasta, $clustalw) = @_;
	my $clustalCommand = $clustalw." -INFILE=$tmpFasta -OUTORDER=INPUT -SEQNOS=ON -TYPE=DNA";
	
	## backticks capture the program's STDOUT so it does not clutter our own output;
	## the captured text itself is not needed
	my $clustalOutput = `$clustalCommand`;
	if($? != 0){
		## NOTE(review): assumes CLUSTAL W exits with 0 on success -- confirm for the installed version
		warn "CLUSTAL W command returned status $?: $clustalCommand\n";
	}
	
	## only swap the file extension; "fasta" elsewhere in the path
	## (directory or organism names) must stay untouched
	my $tmpAln = $tmpFasta;
	$tmpAln =~ s/\.fasta\z/.aln/;
	return($tmpAln);
}







## parseClustalResult($tmpAln, $moHashRef)
##
## Reads the CLUSTAL W alignment file $tmpAln, concatenates the aligned
## (gapped) FR3D and Rfam sequences and maps the FR3D motif positions
## ("posOuter1", "posInner1", "posInner2", "posOuter2" in $moHashRef) onto
## the ungapped Rfam sequence.
##
## The Rfam rows are recognised by the organism name, which is taken from the
## alignment file name (everything before its first "_").
##
## Returns ($hitFlag1, $hitFlag2), each "T" or "F". A flag is "T" only when
## every checked alignment column of that motif strand holds an Rfam residue
## (no gap, not outside the alignment). When both flags are "T" the ungapped
## Rfam start/stop positions are stored in $moHashRef under
## "ungapSeedAliPumo{1,2}Rfam{Start,Stop}".
## Dies if the alignment file cannot be opened.
sub parseClustalResult {

	my ($tmpAln, $moHashRef) = @_;
	my $posOuter1 = $$moHashRef{"posOuter1"};
	my $posOuter2 = $$moHashRef{"posOuter2"};
	my $posInner1 = $$moHashRef{"posInner1"};
	my $posInner2 = $$moHashRef{"posInner2"};
	my $fredSeq = "";
	my $rfamSeq = "";
	
	## the organism name is the leading part of the file name, so use the last
	## path component whatever the directory depth is
	my @pathArr = split(/\//, $tmpAln);
	my $namePath = $pathArr[-1];
	my ($org) = split(/\_/, $namePath);	
	
	open(my $clFh, "<", $tmpAln) or die "Can't open CLUSTAL W alignment '$tmpAln': $!\n";
	while(my $line = <$clFh>){
		## for each row the pattern with the trailing residue counter is tried first;
		## the second pattern covers rows without such a counter.
		## \Q..\E keeps the organism name literal inside the regex.
		if($line =~ /^fr3dSeq.+[\s\t]+([\w\-]+)[\s\t]{1}\d+/){
			$fredSeq .= $1;
		}
		elsif($line =~ /^fr3dSeq.+[\s\t]+([\w\-]+)[\s\t]{1}/){
			$fredSeq .= $1;
		}
		elsif($line =~ /^\Q$org\E.+[\s\t]+([\-\w]+)[\s\t]{1}\d+/){
			$rfamSeq .= $1;
		}
		elsif($line =~ /^\Q$org\E.+[\s\t]+([\-\w]+)[\s\t]{1}/){
			$rfamSeq .= $1;
		}
	}
	close($clFh);
	
	## 1-based alignment columns of the motif borders in the gapped FR3D row
	my $idxMo1Sta = getPos($posOuter1, $fredSeq);
	my $idxMo1Sto = getPos($posInner1, $fredSeq);
	my $idxMo2Sta = getPos($posInner2, $fredSeq);
	my $idxMo2Sto = getPos($posOuter2, $fredSeq);
	
	## same columns expressed as ungapped Rfam positions; for the start columns
	## the number of Rfam gaps seen up to that column is kept as well
	my ($idxRfamMo1Sta, $gapC1) = getMapPos($idxMo1Sta, $rfamSeq);
	my ($idxRfamMo1Sto) = getMapPos($idxMo1Sto, $rfamSeq);
	my ($idxRfamMo2Sta, $gapC3) = getMapPos($idxMo2Sta, $rfamSeq);
	my ($idxRfamMo2Sto) = getMapPos($idxMo2Sto, $rfamSeq);
	
	my $hitFlag1 = _rfamRangeIsGapFree($rfamSeq, $idxRfamMo1Sta, $idxRfamMo1Sto, $gapC1);
	my $hitFlag2 = _rfamRangeIsGapFree($rfamSeq, $idxRfamMo2Sta, $idxRfamMo2Sto, $gapC3);
	
	if(($hitFlag1 eq "T") && ($hitFlag2 eq "T")){
		$$moHashRef{"ungapSeedAliPumo1RfamStart"} = $idxRfamMo1Sta;
		$$moHashRef{"ungapSeedAliPumo1RfamStop"} = $idxRfamMo1Sto;
		$$moHashRef{"ungapSeedAliPumo2RfamStart"} = $idxRfamMo2Sta;
		$$moHashRef{"ungapSeedAliPumo2RfamStop"} = $idxRfamMo2Sto;
	}
	return($hitFlag1, $hitFlag2);
}

## _rfamRangeIsGapFree($rfamSeq, $idxSta, $idxSto, $gapOffset)
##
## Private helper of parseClustalResult. Walks the gapped Rfam row from
## 0-based column ($idxSta - 1 + $gapOffset) for ($idxSto - $idxSta + 1)
## columns. Returns "T" when all of them hold a residue, "F" as soon as one is
## a gap ("-") or lies outside the row, and "F" for an empty range.
sub _rfamRangeIsGapFree {

	my ($rfamSeq, $idxSta, $idxSto, $gapOffset) = @_;
	my $flag = "F";
	for(my $q = $idxSta; $q <= $idxSto; $q++){
		my $col = $q - 1 + $gapOffset;
		if(($col < 0) || ($col >= length($rfamSeq)) || (substr($rfamSeq, $col, 1) eq "-")){
			## one bad column disqualifies the whole strand
			return("F");
		}
		$flag = "T";
	}
	return($flag);
}


## getPos($pos, $se)
##
## Translates residue number $pos (1-based, gaps not counted) of the gapped
## sequence $se into its 1-based column in that gapped sequence: columns are
## consumed from the left until $pos non-"-" characters have been seen, and
## the number of consumed columns is returned. Returns 0 when $pos is 0.
sub getPos {
	
	my ($pos, $se) = @_;
	my @columns = split(//, $se);
	my $residuesSeen = 0;
	my $columnsSeen = 0;
	until($residuesSeen > ($pos-1)){
		## everything that is not a gap character counts as a residue
		$residuesSeen++ unless $columns[$columnsSeen] eq "-";
		$columnsSeen++;
	}
	return($columnsSeen);
}




## getMapPos($pos, $se)
##
## Looks at the first $pos columns of the gapped sequence $se and counts how
## many of them are word characters (residues) and how many are anything else
## (gaps). Returns the list (residue count, gap count); the residue count is
## the ungapped position that corresponds to alignment column $pos.
sub getMapPos {
	
	my ($pos, $se) = @_;
	my @columns = split(//, $se);
	my %count = ("residue" => 0, "gap" => 0);
	for(my $col = 0; $col <= ($pos-1); $col++){
		my $kind = ($columns[$col] =~ /\w{1}/) ? "residue" : "gap";
		$count{$kind}++;
	}
	return($count{"residue"}, $count{"gap"});
}




## writeTmpFasta($fredSeq, $rfamUngapSeq, $org, $rfamEmblSeqPosStart,
##               $rfamEmblSeqPosStop, $pdb, $chI1, $randout)
##
## Writes the two-sequence FASTA input for CLUSTAL W:
##   >fr3dSeq_<chain>_<pdb>                       followed by $fredSeq
##   ><org>_<start>_<stop>_ungapFullRfamSeq       followed by $rfamUngapSeq
##
## The file is named <org>_<start>_<stop>_<pdb>.fasta and placed under
## $randout, which is used as a plain string prefix (so a directory needs its
## trailing "/"). Returns the path of the file. Dies when the file cannot be
## written.
sub writeTmpFasta {

	my ($fredSeq, $rfamUngapSeq, $org, $rfamEmblSeqPosStart, $rfamEmblSeqPosStop, $pdb, $chI1, $randout) = @_;
	my $tmpFasta = $randout.$org."_".$rfamEmblSeqPosStart."_".$rfamEmblSeqPosStop."_".$pdb.".fasta";
	
	## always (re)write the file: its name does not contain the chain, so a file left
	## over from another chain of the same pdb entry would hold the wrong FR3D sequence
	open(my $faFh, ">", $tmpFasta) or die "\nCan't open tmpFasta '$tmpFasta' in clustalW: $!\n\n";
	print {$faFh} ">fr3dSeq_".$chI1."_".$pdb."\n";
	print {$faFh} $fredSeq."\n";
	print {$faFh} ">".$org."_".$rfamEmblSeqPosStart."_".$rfamEmblSeqPosStop."_ungapFullRfamSeq\n";
	print {$faFh} $rfamUngapSeq."\n";
	## buffered write errors only show up at close time
	close($faFh) or die "\nCan't close tmpFasta '$tmpFasta' in clustalW: $!\n\n";
	return($tmpFasta);
}

1;
