#!/usr/bin/perl -w

# script to extract the consensus secondary structure for each Rfam family and compute the bp proportion

use strict;

# Usage: <script> Rfam.seed.gz
#
# Streams a gzip-compressed Rfam seed file (Stockholm format, records
# terminated by a "//" line) and prints one tab-separated row per family:
#   RfamID      accession taken from the "#=GF AC" line
#   annotation  consensus structure rewritten in plain dot-bracket form
#   nbp         number of alignment columns that take part in a base pair
#   len         SS_cons length minus the '.' columns of the "#=GC RF" line
#   bpper       nbp / len
#   rnashape    whitespace-stripped output of `RNAshapes -t 5 -D <structure>`
# External programs required on PATH: zcat, RNAshapes.

# Extract the fields of interest from one Stockholm record.
# Returns ($accession, $ss_cons, $rf); the accession is undef and the two
# annotation strings are '' when the corresponding lines are absent.
# SS_cons and RF are concatenated across lines so that interleaved
# (multi-block) alignments are handled.
sub parse_record {
    my ($record) = @_;
    my ($accession, $ss_cons, $rf) = (undef, '', '');

    foreach my $line (split /\n/, $record) {
        # Anchor on the AC tag itself: a looser match would also fire on
        # free-text GF lines (e.g. a comment containing "H/ACA ... RF00012").
        if ($line =~ /^#=GF\s+AC\s+(RF\S*)/) {
            $accession = $1;
        }
        elsif ($line =~ /^#=GC\s+SS_cons\s+(\S+)/) {
            $ss_cons .= $1;
        }
        elsif ($line =~ /^#=GC\s+RF\s+(\S+)/) {
            $rf .= $1;
        }
    }
    return ($accession, $ss_cons, $rf);
}

# Count the positions involved in base pairs: every opening bracket of
# any of the four bracket types (<, (, {, [) stands for two paired positions.
sub count_paired_positions {
    my ($ss_cons) = @_;
    my $opening = ($ss_cons =~ tr/<({[//);
    return 2 * $opening;
}

# Compute ($nbp, $consensus_len, $bp_fraction) for a structure string and
# its RF annotation.  The consensus length excludes the '.' columns of RF.
# The fraction is reported as 0 when there are no pairs or when the
# consensus length is not positive (avoids a division by zero).
sub structure_stats {
    my ($ss_cons, $rf) = @_;
    my $gap_columns   = ($rf =~ tr/.//);
    my $consensus_len = length($ss_cons) - $gap_columns;
    my $nbp           = count_paired_positions($ss_cons);

    my $bp_fraction = 0;
    if ($nbp > 0 && $consensus_len > 0) {
        $bp_fraction = $nbp / $consensus_len;
    }
    return ($nbp, $consensus_len, $bp_fraction);
}

# Rewrite a consensus structure as plain dot-bracket notation: all bracket
# types collapse to '(' / ')', and letters, '_', '-', ':' and ',' become '.'.
# Any other character is left untouched.
sub to_dot_bracket {
    my ($ss_cons) = @_;
    (my $dot_bracket = $ss_cons) =~ tr/<{[/(/;
    $dot_bracket =~ tr/>}]/)/;
    $dot_bracket =~ tr/A-Za-z_:,\-/./;
    return $dot_bracket;
}

# Run `RNAshapes -t 5 -D <structure>` and return its standard output with
# all whitespace removed.  The list form of open is used so the structure
# string never passes through a shell.  Failures are reported on STDERR and
# yield an empty shape, so one bad family does not abort the whole run.
sub rna_shape {
    my ($dot_bracket) = @_;

    my $pipe;
    unless (open($pipe, '-|', 'RNAshapes', '-t', '5', '-D', $dot_bracket)) {
        warn "Cannot run RNAshapes: $!\n";
        return '';
    }

    local $/;    # slurp the whole output regardless of the caller's separator
    my $shape = <$pipe>;
    close($pipe) or warn "RNAshapes did not exit cleanly (status $?)\n";

    $shape = '' unless defined $shape;
    $shape =~ s/\s+//g;
    return $shape;
}

# Entry point: read the seed file named by the first argument and print
# the table on standard output.
sub main {
    my ($seed_file) = @_;
    unless (defined $seed_file && length $seed_file) {
        die "Usage: $0 Rfam.seed.gz\n";
    }

    open(my $in, '-|', 'zcat', $seed_file)
        or die "Cannot run zcat on '$seed_file': $!\n";

    local $/ = "//\n";    # one Stockholm record per read

    print "RfamID\tannotation\tnbp\tlen\tbpper\trnashape\n";
    while (my $record = <$in>) {
        my ($accession, $ss_cons, $rf) = parse_record($record);

        # Skip anything without both annotation lines (this also covers a
        # trailing fragment after the last "//").
        next unless length($rf) && length($ss_cons);

        my ($nbp, $consensus_len, $bp_fraction) = structure_stats($ss_cons, $rf);
        my $dot_bracket = to_dot_bracket($ss_cons);
        my $shape       = rna_shape($dot_bracket);

        $accession = '' unless defined $accession;
        print join("\t", $accession, $dot_bracket, $nbp, $consensus_len,
                   $bp_fraction, $shape), "\n";
    }

    close($in) or warn "zcat did not exit cleanly on '$seed_file' (status $?)\n";
    return;
}

# Run only when executed as a script, so the helpers above can be loaded
# (e.g. with require) without side effects.
main($ARGV[0]) unless caller;

