#!/usr/bin/perl -w



package createTrainingAndTestData;


use lib "./metaRNAmodules_mapping/";
use getGappedPositions;
use strict;
use warnings;
#use POSIX;
use Data::Dumper;




## Input is the path to a single-line Stockholm alignment file.

## Labels of the two per-column annotation rows that this module writes into
## the Stockholm files it produces. They are file-scoped lexicals shared by
## the subs below; extendNames() pads them in place with trailing blanks.
my $gcsscons = "\#=GC SS_cons";
my $gcrf = "\#=GC RF";




## Entry point of the module: parse a Stockholm alignment, filter its
## sequences with check4Gaps() and hand the survivors to splitDataset().
##
## Parameters:
##   $modStkAliPath - path of the Stockholm alignment, passed to readStkFile()
##   $fam           - family name, used by splitDataset() as file name prefix
##   $outDir        - output directory prefix, passed to splitDataset()
##   $moHashRef     - motif description hash ref, passed to check4Gaps()
##   $div           - number of parts for the cross validation
## Prints a short banner to STDOUT before doing any work.
sub splitData {

	my ($modStkAliPath, $fam, $outDir, $moHashRef, $div) = @_;

	print STDOUT "\n==========================\n";
	print STDOUT "CREATE DATA SET\n\n";

	## 1. parse the alignment
	my ($allSeqs, $allIds, $ssCons, $rfLine, $longestId) = readStkFile($modStkAliPath);

	## 2. keep only the sequences accepted by check4Gaps()
	my ($keptIds, $keptSeqs) = check4Gaps($fam, $moHashRef, $modStkAliPath, $allSeqs, $allIds);

	## 3. write the test and training files
	return splitDataset($fam, $outDir, $keptIds, $ssCons, $rfLine, $keptSeqs, $longestId, $div);
}




## Keep only those sequences that pass verifySequence() for the paired motif
## columns.
##
## Parameters:
##   $fa            - not used by this sub
##   $moHashRef     - hash ref; the entries "ungapSeedAliPumo1RfamStart",
##                    "ungapSeedAliPumo2RfamStart", "dotBracket" and
##                    "motifSeqs" are read
##   $modStkAliPath - not used by this sub
##   $seqHashRef    - hash ref: sequence id => aligned sequence
##   $idArrRef      - array ref of the sequence ids to examine
## Returns (\@ids, \%seqs): the ids that passed, in their input order, and a
## hash with the sequences of exactly those ids.
sub check4Gaps {

	my ($fa, $moHashRef, $modStkAliPath, $seqHashRef, $idArrRef) = @_;

	## The first part of "motifSeqs" (up to the '&') tells how many characters
	## of the dot-bracket string belong to the first strand. The character
	## right behind them is skipped (presumably the '&' separator - confirm).
	my $dotBracket = $$moHashRef{"dotBracket"};
	my ($firstMotif) = split(/\&/, $$moHashRef{"motifSeqs"});
	my $firstLen = length($firstMotif);

	## Each strand: its dot-bracket part, the position its first character
	## corresponds to, and the symbols that mark a position to be collected.
	## NOTE(review): the meaning of the '3' in both character classes is not
	## evident from this file; it is kept unchanged.
	my @strands = (
		[ substr($dotBracket, 0, $firstLen),  $$moHashRef{"ungapSeedAliPumo1RfamStart"}, qr/[\(\{\[\<3]{1}/ ],
		[ substr($dotBracket, $firstLen + 1), $$moHashRef{"ungapSeedAliPumo2RfamStart"}, qr/[\)\}\]\>3]{1}/ ],
	);

	## The sequence whose id contains "full" is handed to getGappedPosHash();
	## the returned hash is looked up with the running position counter
	## (presumably it maps ungapped positions to alignment columns - confirm).
	my ($fullId) = grep { /full/ } keys %$seqHashRef;
	my $gapHashRef = getGappedPositions::getGappedPosHash($$seqHashRef{$fullId});

	my %pairedCols;
	foreach my $strand (@strands){
		my ($symbols, $position, $pairRe) = @$strand;
		foreach my $symbol (split(//, $symbols)){
			if($symbol =~ $pairRe){
				$pairedCols{ $$gapHashRef{$position} } = 1;
			}
			## the counter advances for every symbol, collected or not
			$position++;
		}
	}

	my @keptIds = ();
	my %keptSeqs;
	foreach my $id (@$idArrRef){
		my $seq = $$seqHashRef{$id};
		next unless(verifySequence($seq, \%pairedCols) eq "TRUE");
		$keptSeqs{$id} = $seq;
		push(@keptIds, $id);
	}
	return(\@keptIds, \%keptSeqs);
}





## Check that a sequence has a residue at every requested alignment column.
##
## Parameters:
##   $tmpSeq           - aligned sequence string
##   $pairedPosHashRef - hash ref whose keys are 1-based column numbers
## Returns the string "TRUE" if none of the columns holds a gap character
## ('.' or '-'), otherwise the string "FALSE". A column that is not a
## positive integer or that lies behind the end of the sequence cannot hold
## a residue and therefore also yields "FALSE".
sub verifySequence {

	my ($tmpSeq, $pairedPosHashRef) = @_;
	my $seqLen = defined($tmpSeq) ? length($tmpSeq) : 0;
	foreach my $pos (keys %$pairedPosHashRef){
		## Guard first: substr() with an offset of -1 would silently read the
		## last character, and an offset behind the end would return undef.
		if(($pos !~ /^\d+\z/) || ($pos < 1) || ($pos > $seqLen)){
			return("FALSE");
		}
		## One gap decides the result, the remaining columns need no check.
		if(substr($tmpSeq, ($pos - 1), 1) =~ /[\.\-]{1}/){
			return("FALSE");
		}
	}
	return("TRUE");
}








## Distribute the sequences over $div parts and write, for every part i, the
## files <outDir>i/<fam>_test_i_mod.stk and <outDir>i/<fam>_train_i_mod.stk.
##
## Parameters:
##   $fam             - family name, used as file name prefix
##   $outDir          - directory prefix; the part number and "/" are appended
##   $newidArrRef     - array ref of the sequence ids to distribute
##   $secSeq          - text written behind the "#=GC SS_cons" label
##   $refSeq          - text written behind the "#=GC RF" label
##   $testSeqHashRef  - hash ref: sequence id => aligned sequence
##   $maxlen          - length of the longest sequence id
##   $div             - number of parts (x-fold cross validation)
##
## Parts 1 .. $div-1 each receive int(#ids / $div) randomly drawn ids, the
## last part receives all remaining ids in their input order. The training
## file of part i holds the sequence rows of all other parts.
## Dies if $div is not a positive integer or if a directory or file cannot be
## created or written.
sub splitDataset {

	my ($fam, $outDir, $newidArrRef, $secSeq, $refSeq, $testSeqHashRef, $maxlen, $div) = @_;

	if(!defined($div) || ($div !~ /^\d+\z/) || ($div < 1)){
		die "\nNumber of cross validation parts must be a positive integer\n\n";
	}

	## Work on unpadded copies of the file-level labels, so that padding done
	## elsewhere (or for an earlier alignment) cannot shift the columns here.
	my $ssLabel = $gcsscons;
	my $rfLabel = $gcrf;
	$ssLabel =~ s/\s+\z//;
	$rfLabel =~ s/\s+\z//;

	## Name column: longest id plus three blanks, but always wide enough to
	## leave at least one blank between a "#=GC" label and its annotation.
	my $width = $maxlen + 3;
	foreach my $label ($ssLabel, $rfLabel){
		if($width <= length($label)){ $width = length($label) + 1; }
	}
	my $ssRow = sprintf("%-*s", $width, $ssLabel).$secSeq;
	my $rfRow = sprintf("%-*s", $width, $rfLabel).$refSeq;

	## Draw the ids from a shrinking pool: every id, including the last one,
	## can be picked, and no draw is ever wasted on an id already taken.
	my @pool = @$newidArrRef;
	my $num = int(scalar(@pool) / $div);
	my @partRows = ();
	foreach my $part (1..$div){
		my @ids = ();
		if($part < $div){
			foreach my $draw (1..$num){
				push(@ids, splice(@pool, int(rand(scalar(@pool))), 1));
			}
		}
		else{
			## last part: whatever has not been drawn, in input order
			@ids = @pool;
		}
		my @rows = map { sprintf("%-*s", $width, $_).$$testSeqHashRef{$_} } @ids;
		push(@partRows, \@rows);
	}

	foreach my $part (1..$div){
		my $partDir = $outDir.$part."/";
		if(!-d $partDir){
			## built-in mkdir: no shell is involved, so blanks or shell
			## metacharacters in the path are harmless
			mkdir($partDir) or die "\nCan't create directory $partDir: $!\n\n";
		}
		_writeStkFile($partDir.$fam."_test_".$part."_mod.stk", $partRows[$part - 1], $ssRow, $rfRow);

		## training data = sequence rows of all other parts, in part order
		my @trainRows = map { @{$partRows[$_ - 1]} } grep { $_ != $part } (1..$div);
		_writeStkFile($partDir.$fam."_train_".$part."_mod.stk", \@trainRows, $ssRow, $rfRow);
	}
	return;
}


## Write one Stockholm file: the header line, the given ready-formatted
## sequence rows, the two "#=GC" annotation rows and the closing "//".
## Dies if the file cannot be opened or closed.
sub _writeStkFile {

	my ($path, $rowArrRef, $ssRow, $rfRow) = @_;
	open(my $out, '>', $path) or die "\nCan't write $path: $!\n\n";
	print {$out} "\# STOCKHOLM 1.0\n";
	foreach my $row (@$rowArrRef){
		print {$out} $row."\n";
	}
	print {$out} $ssRow."\n";
	print {$out} $rfRow."\n";
	print {$out} "//\n";
	## buffered write errors only show up when the handle is closed
	close($out) or die "\nCan't close $path: $!\n\n";
	return;
}







## Read a Stockholm alignment file into memory.
##
## Parameter:
##   $modStkAliPath - path of the Stockholm file
## Returns the list (\%seqHash, \@idArr, $secSeq, $refSeq, $maxlen):
##   \%seqHash - sequence id => aligned sequence; rows repeating an id are
##               appended to the sequence already stored for it
##   \@idArr   - sequence ids in the order of their first appearance
##   $secSeq   - concatenated content of the "#=GC SS_cons" rows
##   $refSeq   - concatenated content of the "#=GC RF" rows
##   $maxlen   - length of the longest sequence id
## Blank lines, the "//" terminator and all other lines starting with '#'
## are ignored. Dies if the file cannot be opened.
sub readStkFile {

	my ($modStkAliPath) = @_;
	my %seqHash;
	my @idArr = ();
	my $secSeq = "";
	my $refSeq = "";
	my $maxlen = 0;
	## three-argument open: a path starting with '>' or ending in '|' must be
	## treated as a file name, never as an open mode
	open(my $alFh, '<', $modStkAliPath) or die "\nCan't open $modStkAliPath: $!\n\n";
	while(my $line = <$alFh>){
		## removes "\n" as well as "\r\n" and trailing blanks, so that files
		## with Windows line ends do not leave "\r" in the annotation strings
		$line =~ s/\s+\z//;
		next if($line eq "");
		if($line =~ /^\#=GC[\s\t]+SS\_cons[\s\t]+(.+)/){ $secSeq .= $1; }
		elsif($line =~ /^\#=GC[\s\t]+RF[\s\t]+(.+)/){ $refSeq .= $1; }
		elsif(($line !~ /^\#/) && ($line !~ /^\/\//)){
			## split(' ') also skips leading blanks, so the id is never empty
			my ($id, $seq) = split(' ', $line);
			$seq = "" unless(defined($seq));
			if(exists($seqHash{$id})){
				$seqHash{$id} .= $seq;
			}
			else{
				$seqHash{$id} = $seq;
				if(length($id) > $maxlen){
					$maxlen = length($id);
				}
				push(@idArr, $id);
			}
		}
	}
	close($alFh);
	return(\%seqHash, \@idArr, $secSeq, $refSeq, $maxlen);
}
	



## Pad sequence ids and the two "#=GC" labels with trailing blanks until each
## of them is at least $maxlen + 3 characters long.
##
## Parameters:
##   $idArrRef - array ref of sequence ids; its elements are padded in place
##   $maxlen   - reference length, typically that of the longest id
## Returns the array ref that was passed in.
## Side effect: the file-level labels $gcsscons and $gcrf are padded as well
## and stay padded after the call.
sub extendNames {

	my ($idArrRef, $maxlen) = @_;
	my $width = $maxlen + 3;

	## appends single blanks to the referenced string until it is wide enough;
	## strings that are already long enough are left untouched
	my $padInPlace = sub {
		my ($strRef) = @_;
		$$strRef .= " " while(length($$strRef) < $width);
	};

	## the loop variable aliases the array element, so the caller's array changes
	foreach my $name (@$idArrRef){
		$padInPlace->(\$name);
	}
	$padInPlace->(\$gcsscons);
	$padInPlace->(\$gcrf);

	return($idArrRef);
}


1;


