#!/usr/bin/perl



package createData;


use FindBin;
use lib "$FindBin::Bin/metaRNAmodules_mapping/";
use getGappedPositions;
use strict;
use warnings;
use Data::Dumper;




## Input: the path to a Stockholm alignment file in single-line format (each sequence on one line)



## Row labels of the two per-column annotation lines of a Stockholm alignment
## (SS_cons and RF). They are used as the name column of the last two lines of
## the output alignment; readStkFile recognises the same tags on input.
## extendNames() appends trailing spaces to these variables in place, so their
## length can grow during the lifetime of the process.
my $gcsscons = "\#=GC SS_cons";
my $gcrf = "\#=GC RF";




## modifyAli($modStkAliPath, $fam, $outDir, $moHashRef, $testFiName)
##
## Entry point of this package. Runs the three processing steps in order:
##   1. readStkFile             - parse the Stockholm alignment at $modStkAliPath
##   2. check4Gaps              - keep the sequences accepted for the motif
##                                described by $moHashRef
##   3. printModifiedCleanedAli - write the kept sequences to
##                                $outDir.$fam.$testFiName
## Prints a progress line to STDOUT first.
sub modifyAli {

	my ($modStkAliPath, $fam, $outDir, $moHashRef, $testFiName) = @_;

	print STDOUT "\#\# Create data set\n";

	## step 1: sequences, ids in file order, SS_cons line, RF line, longest id length
	my ($allSeqs, $allIds, $ssCons, $rfLine, $idWidth) = readStkFile($modStkAliPath);

	## step 2: ids and sequences that passed the gap check
	my ($keptIds, $keptSeqs) = check4Gaps($fam, $moHashRef, $modStkAliPath, $allSeqs, $allIds);

	## step 3: write the cleaned alignment
	return printModifiedCleanedAli(
		$fam, $outDir, $keptIds, $ssCons, $rfLine, $keptSeqs, $idWidth, $testFiName
	);
}




## check4Gaps($fa, $moHashRef, $modStkAliPath, $seqHashRef, $idArrRef)
##
## Selects the sequences of an alignment that have a residue (no '.' or '-')
## at every base-paired position of a two-strand motif.
##
## Parameters:
##   $fa             - family name; not used here, kept for interface compatibility
##   $moHashRef      - motif description; the keys read are
##                     "ungapSeedAliPumo1RfamStart", "ungapSeedAliPumo2RfamStart",
##                     "dotBracket" and "motifSeqs" (two strands joined by '&')
##   $modStkAliPath  - alignment path; only used in the error message
##   $seqHashRef     - id => gapped sequence
##   $idArrRef       - ids in the order they should be checked / returned
##
## Returns: (\@keptIds, \%keptSeqs) - ids in input order and id => sequence
##          for every sequence accepted by verifySequence().
##
## Dies if no sequence id contains "full", because the column mapping is
## derived from that sequence.
sub check4Gaps {

	my ($fa, $moHashRef, $modStkAliPath, $seqHashRef, $idArrRef) = @_;
	my %posHash;
	my @newidArr = ();
	my %test;

	my $mug11 = $$moHashRef{"ungapSeedAliPumo1RfamStart"};
	my $mug21 = $$moHashRef{"ungapSeedAliPumo2RfamStart"};
	my $dobra = $$moHashRef{"dotBracket"};
	my $motifSeqs = $$moHashRef{"motifSeqs"};
	my ($mo1) = split(/\&/, $motifSeqs);

	## Split the dot-bracket string into one part per strand. The length of
	## the first strand comes from the motif sequence; the +1 skips one
	## character between the strands (presumably the separator matching the
	## '&' in motifSeqs - verify against the producer of $moHashRef).
	my $len1 = length($mo1);
	my @do1Arr = split(//, substr($dobra, 0, $len1));
	my @do2Arr = split(//, substr($dobra, $len1+1));

	## The mapping is built from the sequence whose id contains "full".
	## Sorting makes the choice deterministic if several ids match.
	my ($fullSeqId) = grep { $_ =~ /full/ } sort keys %$seqHashRef;
	if(!defined($fullSeqId)){
		die "\nNo sequence id containing 'full' found in $modStkAliPath in createData\n\n";
	}
	my $fullGapSeq = $$seqHashRef{$fullSeqId};

	## NOTE(review): assumed to map a position in the ungapped "full" sequence
	## to its 1-based column in the gapped alignment (verifySequence subtracts 1
	## before indexing) - confirm in getGappedPositions.
	my $gapHashRef = getGappedPositions::getGappedPosHash($fullGapSeq);

	## Strand 1: record the column of every opening symbol.
	## NOTE(review): the literal '3' in both character classes is kept from
	## the original code; its meaning is not evident from this file.
	my $tmpmug11 = $mug11;
	foreach my $sym (@do1Arr){
		if($sym =~ /[\(\{\[\<3]{1}/){
			$posHash{ $$gapHashRef{$tmpmug11} } = 1;
		}
		$tmpmug11++;
	}

	## Strand 2: record the column of every closing symbol.
	my $tmpmug21 = $mug21;
	foreach my $sym (@do2Arr){
		if($sym =~ /[\)\}\]\>3]{1}/){
			$posHash{ $$gapHashRef{$tmpmug21} } = 1;
		}
		$tmpmug21++;
	}

	## Keep the sequences that have no gap at any recorded column.
	foreach my $id (@$idArrRef){
		my $seq = $$seqHashRef{$id};
		if(verifySequence($seq, \%posHash) eq "TRUE"){
			$test{$id} = $seq;
			push(@newidArr, $id);
		}
	}
	return(\@newidArr, \%test);
}





## verifySequence($tmpSeq, $pairedPosHashRef)
##
## Checks that a gapped sequence has no gap character ('.' or '-') at any of
## the given positions.
##
## Parameters:
##   $tmpSeq            - the gapped sequence string
##   $pairedPosHashRef  - hash ref whose keys are 1-based positions to check
##
## Returns: the string "FALSE" if at least one position holds '.' or '-',
##          otherwise the string "TRUE". Positions outside the sequence
##          are not counted as gaps.
sub verifySequence {

	my ($tmpSeq, $pairedPosHashRef) = @_;
	my $seqLen = defined($tmpSeq) ? length($tmpSeq) : 0;

	## The order of the positions does not affect the result, so no sort is needed.
	foreach my $pos (keys %$pairedPosHashRef){
		my $idx = $pos - 1;
		## Skip indices substr() cannot address; this avoids "substr outside
		## of string" and uninitialized-value warnings.
		next if($idx >= $seqLen || $idx < -$seqLen);
		## One gap is enough to reject the sequence.
		if(substr($tmpSeq, $idx, 1) =~ /[.\-]/){
			return("FALSE");
		}
	}
	return("TRUE");
}








## printModifiedCleanedAli($fam, $outDir, $newidArrRef, $secSeq, $refSeq,
##                         $testSeqHashRef, $maxlen, $testFiName)
##
## Writes a Stockholm alignment to $outDir.$fam.$testFiName and compresses it
## with "gzip -9".
##
## Parameters:
##   $fam, $outDir, $testFiName - concatenated (without any separator) into
##                                the output path; $outDir is created if missing
##   $newidArrRef     - ids to write, in output order
##   $secSeq          - data of the "#=GC SS_cons" line
##   $refSeq          - data of the "#=GC RF" line
##   $testSeqHashRef  - id => sequence
##   $maxlen          - length of the longest id; the name column is
##                      $maxlen+3 wide, or wider if a label or id needs it
##
## Neither the caller's id array nor the file-level label variables are
## modified. Dies if the directory or file cannot be created or written;
## a failing gzip/unlink only produces a warning.
sub printModifiedCleanedAli {

	my ($fam, $outDir, $newidArrRef, $secSeq, $refSeq, $testSeqHashRef, $maxlen, $testFiName) = @_;

	## Use private, unpadded copies of the labels so the result does not
	## depend on padding left behind by earlier calls.
	(my $ssLabel = $gcsscons) =~ s/\s+$//;
	(my $rfLabel = $gcrf) =~ s/\s+$//;

	## Width of the name column. Widen it when needed so that every name is
	## followed by at least one space (otherwise e.g. "#=GC SS_cons" would be
	## fused with its data when all ids are short).
	my $width = $maxlen + 3;
	foreach my $name ($ssLabel, $rfLabel, @$newidArrRef){
		if(length($name) >= $width){ $width = length($name) + 1; }
	}

	## CREATE and PRINT DATA FILE
	if(!-d $outDir){
		mkdir($outDir) or -d $outDir or die "Can't create directory $outDir in createData: $!\n\n";
	}
	my $uzipfi = $outDir.$fam.$testFiName;
	open(my $fout, '>', $uzipfi) or die "Can't open $uzipfi in createData: $!\n\n";
	print {$fout} "\# STOCKHOLM 1.0\n\n";

	foreach my $id (@$newidArrRef){
		printf {$fout} "%-*s%s\n", $width, $id, $$testSeqHashRef{$id};
	}

	printf {$fout} "%-*s%s\n", $width, $ssLabel, $secSeq;
	printf {$fout} "%-*s%s\n", $width, $rfLabel, $refSeq;
	print {$fout} "//\n";
	## Buffered write errors only surface on close.
	close($fout) or die "Can't close $uzipfi in createData: $!\n\n";

	## Compress the file. The list form of system() bypasses the shell, so
	## unusual characters in the path are safe.
	my $zipfi = $uzipfi.".gz";
	if(!-e $zipfi){
		system("gzip", "-9", "--", $uzipfi) == 0
			or warn "gzip -9 $uzipfi failed in createData\n";
	}
	else{
		## NOTE(review): an archive already exists; as in the original code
		## it is kept and the freshly written uncompressed file is removed.
		unlink($uzipfi) or warn "Can't remove $uzipfi in createData: $!\n";
	}
	return;
}







## readStkFile($modStkAliPath)
##
## Parses a Stockholm alignment file.
##
## Sequence lines are "<id> <sequence>". If an id occurs on several lines
## (interleaved alignment) the pieces are concatenated. The data of the
## "#=GC SS_cons" and "#=GC RF" lines is concatenated in the same way. All
## other lines starting with '#', the "//" terminator, and blank or
## whitespace-only lines are ignored, as are lines that do not contain both
## an id and a sequence.
##
## Returns: (\%seqHash, \@idArr, $secSeq, $refSeq, $maxlen)
##   \%seqHash - id => sequence
##   \@idArr   - ids in order of first appearance
##   $secSeq   - SS_cons annotation ("" if absent)
##   $refSeq   - RF annotation ("" if absent)
##   $maxlen   - length of the longest sequence id (0 if there are none)
##
## Dies if the file cannot be opened.
sub readStkFile {

	my ($modStkAliPath) = @_;
	my %seqHash;
	my @idArr = ();
	my $secSeq = "";
	my $refSeq = "";
	my $maxlen = 0;

	open(my $alFh, '<', $modStkAliPath) or die "\nCan't open $modStkAliPath in createData: $!\n\n";
	while(my $line = <$alFh>){
		chomp($line);
		## \S+ keeps trailing blanks or a stray "\r" out of the annotation.
		if($line =~ /^\#=GC\s+SS_cons\s+(\S+)/){ $secSeq .= $1; }
		elsif($line =~ /^\#=GC\s+RF\s+(\S+)/){ $refSeq .= $1; }
		elsif(($line !~ /^\#/) && ($line !~ /^\/\//)){
			## split(' ') ignores leading whitespace, so the id is always
			## the first token of the line.
			my ($id, $seq) = split(' ', $line);
			## Blank, whitespace-only or malformed line: nothing to store.
			next if(!defined($seq));
			if(exists($seqHash{$id})){
				$seqHash{$id} .= $seq;
			}
			else{
				$seqHash{$id} = $seq;
				if(length($id) > $maxlen){
					$maxlen = length($id);
				}
				push(@idArr, $id);
			}
		}
	}
	close($alFh);
	return(\%seqHash, \@idArr, $secSeq, $refSeq, $maxlen);
}
	



## extendNames($idArrRef, $maxlen)
##
## Right-pads every id in @$idArrRef, and the file-level "#=GC SS_cons" and
## "#=GC RF" labels, with spaces until each is at least $maxlen+3 characters
## long. Entries that are already long enough are left untouched.
##
## The padding is done in place: the caller's array elements and the
## file-level label variables are modified. Returns the same array reference
## that was passed in.
sub extendNames {

	my ($idArrRef, $maxlen) = @_;
	my $width = $maxlen + 3;

	## The loop variable aliases each element, so appending to it changes
	## the original array entry / label variable.
	foreach my $name (@$idArrRef, $gcsscons, $gcrf){
		$name .= " " while(length($name) < $width);
	}
	return($idArrRef);
}


1;


