#!/usr/bin/nawk -f

# align2symvec.awk version 1.0
#
# Jan Gorodkin
# Center for Biological Sequence Analysis
# The Technical University of Denmark
# B206, DK-2800 Lyngby
# Denmark
# gorodkin@cbs.dtu.dk

# example to execute
# ./align2symvec.awk -v Itype=2 -v sl=1 -v startpos=1 -v ntdistfile="ntdist1" -v pAU=1 -v pCG=1 -v pGU=0.8 datafile | more
# A file align.info will be generated. This file contains all the relevant
# information concerning information contents of the logo.


BEGIN { }

# One-time setup, performed when the first input record arrives.
# The values come from variables supplied on the command line
# (pAU, pCG, pGU, sl, Itype).
NR == 1 {
    # Weights later applied to the A-U, C-G and G-U pair counts.
    CAU = pAU
    CCG = pCG
    CGU = pGU

    # Index under which the next alignment row will be stored.
    k = 1

    # sl selects between the two output layouts; anything but 0 or 1
    # only triggers a warning here, the value itself is kept.
    structlogo = sl
    if (!(structlogo == 0 || structlogo == 1)) {
        print "\nWARNING variable sl should be 0 or 1."
    }

    # An unsupported Itype falls back to 2.
    if (!(Itype == 1 || Itype == 2)) {
        Itype = 2
        print "\nWarning variable Itype should be 1 or 2.\n"
        print "Itype has been sat equal 2.\n"
    }
}
# Per-record rule.  Only records whose first field is ">" are used; their
# second field is split into single characters.  The first record of the
# input fills struct[], every later one becomes a row of seq[] and
# advances the row counter k.  seqlength always holds the length of the
# most recent ">" record.
{
    if ($1 == ">") {
        seqlength = length($2)

        if (NR == 1) {
            for (i = 1; i <= seqlength; i++)
                struct[i] = substr($2, i, 1)
        } else {
            for (i = 1; i <= seqlength; i++)
                seq[k, i] = substr($2, i, 1)
            k++
        }
    }
}
# END rule: all computation happens here, after the per-record rule has
# stored the whole alignment.  It relies on these globals:
#   struct[i]      structure symbol of column i (from the first ">" record)
#   seq[l,i]       symbol of row l at column i, rows l = 1 .. k-1
#   seqlength      length of the last ">" record read
#   CAU, CCG, CGU  weights applied to A-U, C-G and G-U pair counts
#   structlogo, Itype, startpos, ntdistfile   command-line settings
# It writes the symvec description to standard output and a per-position
# summary to the file align.info.
# Symbol indices used throughout: 1=A, 2=C, 3=G, 4=U, 5="-" (gap).
END{

   # Reading nt dist. for each position. Or use the previous line.
   # The latter means that if you only have one line in the ntdistfile
   # it corresponds to having only one background dist for the whole
   # alignment.
   pos=1;
   while( (getline < ntdistfile) > 0 )
   {
      if(NF==4)
      {
        p[1,pos]=$1;
        p[2,pos]=$2;
        p[3,pos]=$3;
        p[4,pos]=$4;
        p[5,pos]=1;
        probline=$0;
      }

      # Check that the four probabilities add to one at a resolution of
      # 1e-4.  The sum is rounded once instead of truncating every term:
      # a value such as 0.57 scales to just below 5700 in binary floating
      # point, and a per-term int() would then lose a whole unit and
      # reject a valid distribution.
      psum=0;
      for(j=1;j<5;j++) psum+=p[j,pos];
      tmp=int(10000.0*psum+0.5);
      if(tmp!=10000)
      {
          print "Your a priori probabilities do not add to one!  (exit)";
          print tmp/10000"   "probline;
          exit;
      }
     pos++;
   }
   close(ntdistfile);

   # pos is still 1 when the file was missing, unreadable or empty.  The
   # information calculation divides by p[j,i], so stop with a message
   # rather than run into a division by zero.
   if(pos==1 && seqlength>0)
   {
      print "Could not read any a priori probabilities from the ntdistfile!  (exit)";
      exit 1;
   }

   # pos is one past the last line read.  Every remaining position up to
   # and including seqlength inherits the distribution of the position
   # before it; the loop bounds alone decide whether anything is copied.
   for(i=pos;i<seqlength+1;i++)
   {
      p[1,i]=p[1,i-1];
      p[2,i]=p[2,i-1];
      p[3,i]=p[3,i-1];
      p[4,i]=p[4,i-1];
      p[5,i]=1;
   }

   # The frequencies below are divided by the number of rows, k-1.
   if(seqlength>0 && k<2)
   {
      print "No sequences found in the alignment!  (exit)";
      exit 1;
   }

   # finding the sequence information of each position
   for(i=1;i<seqlength+1;i++)
   {
      # count the symbols in column i
      for(l=1;l<k;l++)
      {
         if(seq[l,i]=="A") q[1,i]++;
         else if(seq[l,i]=="C") q[2,i]++;
         else if(seq[l,i]=="G") q[3,i]++;
         else if(seq[l,i]=="U") q[4,i]++;
         else if(seq[l,i]=="-") q[5,i]++;
         else { print "not allowed symbol:  "seq[l,i];  exit; }
      }

      # turn counts into frequencies and compute q*log2(q/p) per symbol
      for(j=1;j<6;j++)
      {
         q[j,i] = q[j,i]/(k-1);
         if(q[j,i] > 0) IC[j,i] = q[j,i] * log ( q[j,i]/p[j,i] ) /log(2);
         else IC[j,i] = 0;
      }

      Ipos[i]=0;
      for(j=1;j<6;j++) Ipos[i] += IC[j,i];

      ICtot += Ipos[i];
   }


   # Pair up structure symbols and compute the pair term M[] for both
   # columns of each pair.  For column i the columns j are scanned from the
   # right end towards i; the first j whose symbol matches is taken.
   # A match is an opening bracket at i with the same kind of closing
   # bracket at j, or, for symbols that are not brackets, the same symbol
   # at both columns.  "." never pairs.
   # The bracket characters are written as plain string literals: a
   # backslash before them is not a defined escape sequence in awk strings
   # and implementations disagree on whether the backslash is kept.
   print "pos     tot inf."  > "align.info";
   for(i=1;i<seqlength+1;i++)
   {
     for(j=seqlength;j>i;j--)
     {
       stmp=0;
       symbuse=0;
       if(struct[i]=="(" && struct[j]==")") stmp=1;
       if(struct[i]=="[" && struct[j]=="]") stmp=1;
       if(struct[i]=="<" && struct[j]==">") stmp=1;
       if(struct[i]=="{" && struct[j]=="}") stmp=1;
       if(struct[i]=="(" || struct[i]==")") symbuse=1;
       if(struct[i]=="[" || struct[i]=="]") symbuse=1;
       if(struct[i]=="{" || struct[i]=="}") symbuse=1;
       if(struct[i]=="<" || struct[i]==">") symbuse=1;
       if(symbuse==0 && struct[i]==struct[j]) stmp=1;

       if(stmp==1 && struct[i]!=".")
       {
         qij=0;
         qijAU=0; qijCG=0; qijGU=0;

         # count the rows showing each kind of pair, in either orientation
         for(l=1;l<k;l++)
         {
           if(seq[l,i]=="A" && seq[l,j]=="U") qijAU++;
           else if(seq[l,i]=="U" && seq[l,j]=="A") qijAU++;
           if(seq[l,i]=="G" && seq[l,j]=="U") qijGU++;
           else if(seq[l,i]=="U" && seq[l,j]=="G") qijGU++;
           if(seq[l,i]=="G" && seq[l,j]=="C") qijCG++;
           else if(seq[l,i]=="C" && seq[l,j]=="G") qijCG++;
         }

         # weighted observed pair frequency
         qij = (qijAU*CAU + qijCG*CCG + qijGU*CGU)/(k-1);

         # weighted pair frequency computed from the single-column
         # frequencies of columns i and j
         pij  =  q[1,i]*q[4,j]*CAU + q[2,i]*q[3,j]*CCG + q[3,i]*q[2,j]*CCG+q[3,i]*q[4,j]*CGU + q[4,i]*q[1,j]*CAU + q[4,i]*q[3,j]*CGU;
         #          A      U            C      G            G      C            G      U            U      A            U     G


         if(pij>0 && qij>0) Mi = qij * log( qij/pij)/log(2);
         else Mi = 0;
         # (1-pij) is a divisor and its ratio goes into log(), so it has to
         # be positive as well.
         if(pij>0 && (1-pij)>0 && (1-qij)>0) Mj = (1-qij) * log( (1-qij)/(1-pij))/log(2);
         else Mj = 0;
         Mij = Mi+Mj;
         # the pair term is split evenly between the two columns
         M[i] = 0.5*Mij;
         M[j] = M[i];

         # mark both columns as used so neither is paired again
         struct[i]=".";
         struct[j]=".";
         break;
       }
     }
       # Number the position the same way as the symvec output below,
       # where column i is reported as i+startpos-1.
       infopos = i+startpos-1;
       print infopos"          "Ipos[i]+M[i] >> "align.info";
   }




   #print beginning of symvec file
   print "* alpro 1.64";
   print "*.";
   print "*.";
   print "* RNA STRUCTURE ALIGNMENT";
   print "* position, samples, information, variance";
   if(structlogo==1) print "6 number of symbols";
   if(structlogo==0) print "5 number of symbols";

   # One symvec entry per column: a header line followed by one line per
   # symbol (and an extra "M" line when structlogo is 1).  ii is the
   # reported position, i the column index into the arrays.
   for(ii=startpos;ii<seqlength+startpos;ii++)
   {
      i=ii-startpos+1;

      if(structlogo==1)
      {
         # tmp2 is the absolute value of the scaled pair term; Mnum
         # carries the sign of M[i].
         tmp=int(10000*M[i]);
         tmp2 = sqrt(tmp*tmp);
         Jnum=tmp2;
         if(M[i]<0) Mnum=-tmp2;
         else Mnum=tmp2;
         Jtot = Ipos[i] + M[i];
      }
      else
      {
         Jnum=0;
         Jtot = Ipos[i];
      }

      # Scaled per-symbol values.  A symbol whose frequency is below its
      # background probability gets a negated value.  Jnum accumulates the
      # absolute values.
      if(Itype==1)
      {
        # each symbol's share is its frequency q
        for(n=1;n<6;n++)
        {
          tmp=int(10000.0*q[n,i]*Ipos[i]);
          if(q[n,i]<p[n,i]) Inum[n] = -tmp;
          else Inum[n] = tmp;
          tmp2 = sqrt(tmp*tmp);
          Jnum = Jnum+tmp2;
        }
      }
      else if(Itype==2)
      {
        # each symbol's share is q/p, normalised over the five symbols
        NORM=0;
        for(n=1;n<6;n++)  NORM += q[n,i]/p[n,i];

        for(n=1;n<6;n++)
        {
          tmp=int(10000.0 * ( (q[n,i]/p[n,i])/NORM) * Ipos[i]);
          if(q[n,i]<p[n,i]) Inum[n] = -tmp;
          else Inum[n] = tmp;
          tmp2 = sqrt(tmp*tmp);
          Jnum = Jnum+tmp2;
        }
      }



      letter[1]="A";
      letter[2]="C";
      letter[3]="G";
      letter[4]="U";
      letter[5]="-";


      # Sort the symbols by increasing absolute value, keeping Inum[] and
      # letter[] in step with the sort key Tnum[].
      for(n=1;n<6;n++) Tnum[n]=sqrt(Inum[n]*Inum[n]);

      for(n=1;n<6;n++)
      {
         for(m=n+1;m<6;m++)
         {
            if(Tnum[n]>Tnum[m])
            {
               Ttmp=Tnum[m];
               Tnum[m]=Tnum[n];
               Tnum[n]=Ttmp;

               Itmp=Inum[m];
               Inum[m]=Inum[n];
               Inum[n]=Itmp;

               ltmp=letter[m];
               letter[m]=letter[n];
               letter[n]=ltmp;
            }
         }
      }

      # when no symbol has a positive value the reported total is zero
      sizecheck=0;
      for(n=1;n<6;n++) if(Inum[n]>0) sizecheck=1;
      if(sizecheck==0) Jtot=0.0;
      printf("%2d  %5d   %f   0\n", ii, Jnum, Jtot);

      for(n=1;n<6;n++) printf("%c   %d\n", letter[n], Inum[n]);


      if(structlogo==1) printf("M   %d\n",Mnum);




      Jseq+=Jtot;

   }

    print "Information content for the whole alignment:  "Jseq >> "align.info";

}


















