#!/usr/bin/perl


package getRfamStructureTables;



use FindBin;                 # locate this script
use lib "$FindBin::Bin/metaRNAmodules_mapping/";
use lib "$FindBin::Bin/lib/";
use pumo2fr3d;
use utils;
use strict;
use warnings;
use Data::Dumper;


## Package that returns, for a given PDBid, the mapping positions from the PDB sequence to the Rfam
## alignments, if such a mapping exists
## Input: PDBid
## Return: hash ref with fam, structure table entry (organism, PDBEmblPosition, PDBid, PDBRfamPosition),
## gapRfamSeq, gapRfamSeqLength, ungapRfamSeqLength, ungapRfamSeq, PDB chain, RfamEmblPosition, PDBid,
## PDBRfamStart, PDBRfamStop
## NOTE1: needs, for each family which is linked to the PDBid, an ungapped alignment file in fasta format.
## Furthermore, it needs a directory path to the structure tables from Rfam for each family in txt format
## and the rfam_keywords file to extract for each family the corresponding PDBids (uses a bash script for that)
## written: 07/03/2012 - corinna









## Drives the lookup for one motif: collects the Rfam families linked to the motif's PDB id, keeps those
## that also occur in the cleaned Rfam alignment file and, if any are left, hands them on to
## getStructureTables.
## Input:  $moHashRef - hash ref describing the motif; its "pdb" entry is read (and lower-cased) here
##         $infoHash  - hash ref with the settings; "rootPath", "keywordFile", "keywordScript" and
##                      "cleanedRfamAliZipFi" are read here, the whole hash is passed on
## Output: progress messages on STDOUT
sub getRfamStructureLinks {

	my ($moHashRef, $infoHash) = @_;
	my $pdbId = lc($$moHashRef{"pdb"});

	####################
	## print information
	print STDOUT "\#\# Get RFAM StructureTables\n";

	## Rfam families linked to the PDB id
	my $linkedFamRef = getFamilies($pdbId, $$infoHash{"rootPath"}, $$infoHash{"keywordFile"}, $$infoHash{"keywordScript"});

	if(scalar(keys %$linkedFamRef) == 0){
		print STDOUT "No RFAM families found\n";
	}
	else{
		## of those, the families that belong to the cleaned Rfam dataset (e.g. Rfam.seed.rr95.90.30.gz)
		my $cleanedFamRef = check4CleanedRfam($$infoHash{"cleanedRfamAliZipFi"}, $linkedFamRef);

		if(scalar(keys %$cleanedFamRef) == 0){
			print STDOUT "No cleaned RFAM families found\n";
		}
		else{
			## get structure table entries, proceed with pipeline
			getStructureTables($moHashRef, $infoHash, $cleanedFamRef);
		}
	}
}






## Determines which of the given Rfam families occur in the cleaned Rfam alignment file.
## Input:  $cleanedRfamAliZipFi - path to the cleaned Rfam alignment file; it is read through zcat if
##                                the name ends in .gz and directly otherwise
##         $famHashRef          - hash ref whose keys are the family names to look for
## Return: hash ref with one key (value 0) for every family that occurs on a "#=GF AC   RF0" line of
##         the file; empty if the file does not exist, cannot be read or holds none of the families
sub check4CleanedRfam {

	my ($cleanedRfamAliZipFi, $famHashRef) = @_;
	my %cleanedFamHash;
	return(\%cleanedFamHash) unless(defined $cleanedRfamAliZipFi && -e $cleanedRfamAliZipFi);

	## the path never passes through a shell, so blanks or shell metacharacters in it are harmless
	my $aliFh;
	my $opened = ($cleanedRfamAliZipFi =~ /\.gz$/)
		? open($aliFh, "-|", "zcat", $cleanedRfamAliZipFi)
		: open($aliFh, "<", $cleanedRfamAliZipFi);
	unless($opened){
		warn "Can't read $cleanedRfamAliZipFi in check4CleanedRfam: $!\n";
		return(\%cleanedFamHash);
	}

	## keep only the accession lines
	my @cleanedFamArr = grep { index($_, "#=GF AC   RF0") >= 0 } <$aliFh>;
	close($aliFh);

	foreach my $fa(keys %$famHashRef){
		## an empty name would match every line
		next unless(length $fa);
		## literal substring test, the family name is not meant to be a regex
		if(grep { index($_, $fa) >= 0 } @cleanedFamArr){
			$cleanedFamHash{$fa} = 0;
		}
	}
	return(\%cleanedFamHash);
}






## Extracts the Rfam families that are linked to a PDB id from the keyword file.
## The keyword file is streamed (zcat if its name ends in .gz, cat otherwise) through two awk steps:
##   1. on tab-separated lines whose 5th column matches /[0-9a-z\s]+/, the 5th column is split on runs
##      of two or more blanks and the 2nd column is printed together with the second piece
##   2. lines with a non-empty 2nd whitespace-separated field are re-printed field by field (the last
##      field is printed once more at the end of the line)
## Every resulting line that contains the PDB id contributes its first whitespace-separated token.
## Input:  $pdb            - PDB id, compared literally and case-sensitively
##         $rootPath       - not used in this sub
##         $keyFile        - path to the keyword file
##         $keywordsScript - not used in this sub
## Return: hash ref with one key (value 0) per family; empty if $pdb is empty or nothing matches
sub getFamilies {

	my ($pdb, $rootPath, $keyFile, $keywordsScript) = @_;
	my %famHash;

	## without a PDB id there is nothing to link
	return(\%famHash) unless(defined $pdb && length $pdb);

	my $reader = ($keyFile =~ /\.gz$/) ? "zcat" : "cat";
	## make a single quote inside the path safe for the single-quoted shell argument
	(my $quotedKeyFile = $keyFile) =~ s/'/'\\''/g;

	my $keywords = `$reader '$quotedKeyFile'| awk -F"\\t" '\$5 ~ /[0-9a-z\\s]+/ { split(\$5, fam, /  +/); print \$2,fam[2] }' | awk '{ if ( length( \$2 ) > 0 ) { for(i=1; i<=NF; i++) {printf("%s ",\$i)} print \$NF } }'`;
	## backticks give undef if the command could not be run at all
	$keywords = "" unless(defined $keywords);

	## literal substring test: the PDB id is not a regex
	my @fampdbArr = grep { index($_, $pdb) >= 0 } split(/\n/, $keywords);
	foreach my $fampdb(@fampdbArr){
		$famHash{(split(/\s/,$fampdb))[0]} = 0;
	}
	return(\%famHash);
}





## Reads the structure table of every given family, picks the entries of the motif's PDB id and
## passes each entry whose chain equals one of the motif's outer chains to pumo2fr3d::mapPumo2Fr3d.
## Input:  $moHashRef         - hash ref of the motif; "pdb", "chainOuter1" and "chainOuter2" are read
##         $infoHash          - hash ref with the settings; "strucTabDir", "strucTabName" and
##                              "instTmpDir" are read here, the whole hash is passed on
##         $cleanedFamHashRef - hash ref whose keys are the families to process
## A table line is used if it contains, in this order: organism, EMBL position range, the PDB id,
## the chain, a tab and the residue range. For each such line a hash with the keys org, pdbRfamChain,
## pdbEmblSeqPosStart, pdbEmblSeqPosStop, pdbResStart, pdbResStop, fam and pdbEmblStrand is built.
## Dies if a structure table cannot be opened.
sub getStructureTables {

	my ($moHashRef, $infoHash, $cleanedFamHashRef) = @_;
	my $structureTableDir = $$infoHash{"strucTabDir"};
	my $structureTableName = $$infoHash{"strucTabName"};
	my $pdb = lc($$moHashRef{"pdb"});
	my $tmpDir = $$infoHash{"instTmpDir"};

	## Both patterns are compiled once. The PDB id is quoted so that it is matched literally, and no /g
	## is used: /g in scalar context would make a second match continue behind the first one.
	my $pdbRegex = qr/\Q$pdb\E/i;
	my $entryRegex = qr/([\w\d\.]+)[\s\t]+([\d\-]+)[\s\t]+\Q$pdb\E[\s\t]+([\d\w]+)[\t]+([\d\s\-]+)/i;

	foreach my $fam(keys %$cleanedFamHashRef){
		my $structureTableFi = $structureTableDir.$fam.$structureTableName;
		my $structureTableFiUz = prepareFile($structureTableFi, $tmpDir);

		open(my $tableFh, "<", $structureTableFiUz) or die "\nCan't open $structureTableFiUz, no such file or directory in getRfamStructureTables\n\n";
		## a lexical line variable keeps the callee below from clobbering the current line
		while(my $line = <$tableFh>){

			next unless($line =~ $pdbRegex);
			chomp($line);
			next unless($line =~ $entryRegex);

			## every group matches at least one character, so all four captures are non-empty
			my ($o, $p, $c, $ps) = ($1, $2, $3, $4);

			## EMBL range "start-stop"; a descending range is stored ascending with strand "-".
			## Anything that is not two plain numbers leaves the defaults untouched.
			my $pdbEmblSeqPosStart = 0;
			my $pdbEmblSeqPosStop = 0;
			my $pdbEmblStrand = "";
			my ($tmpPdbEmblSeqPosStart, $tmpPdbEmblSeqPosStop) = split(/-/, $p);
			if(defined $tmpPdbEmblSeqPosStart && defined $tmpPdbEmblSeqPosStop
				&& $tmpPdbEmblSeqPosStart =~ /^\d+$/ && $tmpPdbEmblSeqPosStop =~ /^\d+$/){
				if($tmpPdbEmblSeqPosStart <= $tmpPdbEmblSeqPosStop){
					$pdbEmblSeqPosStart = $tmpPdbEmblSeqPosStart;
					$pdbEmblSeqPosStop = $tmpPdbEmblSeqPosStop;
					$pdbEmblStrand = "+";
				}
				else{
					$pdbEmblSeqPosStart = $tmpPdbEmblSeqPosStop;
					$pdbEmblSeqPosStop = $tmpPdbEmblSeqPosStart;
					$pdbEmblStrand = "-";
				}
			}

			## residue range "start - stop": blanks are removed first, a missing part stays 0
			$ps =~ s/\s//g;
			my ($pdbResStart, $pdbResStop) = split(/-/, $ps);
			$pdbResStart = 0 unless(defined $pdbResStart);
			$pdbResStop = 0 unless(defined $pdbResStop);

			## a chain given as "NA" is reported as a missing entry (a chain named "0" is a valid chain)
			if($c ne "NA"){

				my %strucTabEntryHash = (
					"org" => $o,
					"pdbRfamChain" => $c,
					"pdbEmblSeqPosStart" => $pdbEmblSeqPosStart,
					"pdbEmblSeqPosStop" => $pdbEmblSeqPosStop,
					"pdbResStart" => $pdbResStart,
					"pdbResStop" => $pdbResStop,
					"fam" => $fam,
					"pdbEmblStrand" => $pdbEmblStrand,
				);

				## 2. filter: chain of structure table has to be same as the motif chains
				if(($c eq $$moHashRef{"chainOuter1"}) || ($c eq $$moHashRef{"chainOuter2"})){
					pumo2fr3d::mapPumo2Fr3d($moHashRef, $infoHash, \%strucTabEntryHash);
				}
			}
			else{
				print STDOUT "No structure table entry!\n";
			}
		}
		close($tableFh);
		#`rm -rf $tmpDir`;
	}
	return;
}



## Makes an uncompressed copy of a structure table available in the temporary directory.
## Input:  $structureTableFi - path to the structure table (gzip-compressed if the name ends in .gz)
##         $tmpDir           - temporary directory; the file name is appended to it as is, so it is
##                             expected to end with a "/"
## Return: path of the uncompressed copy ($tmpDir + file name without a trailing .gz). The path is
##         returned even if the source does not exist or copying/unpacking failed, so the caller has
##         to check that it can be opened.
## The copy is only made if neither the compressed nor the uncompressed copy exists already.
sub prepareFile {

	my ($structureTableFi, $tmpDir) = @_;
	my $structureTableFiName = (split(/\//, $structureTableFi))[-1];
	my $tmpStructureTableFi = $tmpDir.$structureTableFiName;
	## strip only a trailing .gz, which is exactly what gzip -d does to the name
	(my $tmpStructureTableFiUz = $tmpStructureTableFi) =~ s/\.gz$//;

	if(-e $structureTableFi){
		if(! -e $tmpStructureTableFi && ! -e $tmpStructureTableFiUz){
			## copy to the exact path that is tested and returned here, without going through a shell
			require File::Copy;
			File::Copy::copy($structureTableFi, $tmpStructureTableFi)
				or warn "Can't copy $structureTableFi to $tmpStructureTableFi in prepareFile: $!\n";
		}
		## unpack a compressed copy, freshly made or left over from an earlier run
		if($tmpStructureTableFi ne $tmpStructureTableFiUz && -e $tmpStructureTableFi && ! -e $tmpStructureTableFiUz){
			system("gzip", "-d", $tmpStructureTableFi) == 0
				or warn "Can't unpack $tmpStructureTableFi in prepareFile\n";
		}
	}
	return($tmpStructureTableFiUz);
}


1;

