/* canda.c
 * Zizhen Yao
 *
 * CVS $Id: canda.c,v 3.1 2006/03/07 19:38:35 yzizhen Exp $
 * 
 * Align the seed candidates found by cands.
 */
 
#include <stdlib.h>
#include <string.h>
#include "cand.h"
#include "squid.h"
#include "treedist.h"
#include "edit_cost.h"
#include "dist_vars.h"


/* Function: BaseCode()
 *
 * Purpose:  Translate a nucleotide character into a small integer code.
 *           Upper and lower case are treated alike, and T and U share
 *           one code.
 *
 * Args:     c - the character to classify
 *
 * Returns:  1 for A/a, 2 for C/c, 3 for G/g, 4 for T/t/U/u,
 *           and 0 for every other character.
 */
int BaseCode(char c)
{
  /* Parallel tables: symbols[n] is recognised and yields codes[n]. */
  static const char symbols[] = { 'A', 'a', 'C', 'c', 'G', 'g', 'T', 't', 'U', 'u' };
  static const int  codes[]   = {  1,   1,   2,   2,   3,   3,   4,   4,   4,   4  };
  int n;

  for (n = 0; n < (int) sizeof(symbols); n++) {
    if (symbols[n] == c)
      return codes[n];
  }
  return 0;
}

/* Function: MultipleAlignment()
 *
 * Purpose:  Merge the pairwise alignments of candidate 0 against every
 *           other candidate into one multiple alignment. Candidate 0 is
 *           the reference: each of its positions gets its own output
 *           column, and enough columns are left between consecutive
 *           reference positions to hold the longest run of pairwise
 *           columns any other candidate has there.
 *
 * Args:     sqinfo  - entry i supplies the name, length, accession and
 *                     description stored for alignment row i
 *           cand    - the candidates; cand[i].seq and cand[i].ss are
 *                     copied into row i, cand[0].len is the reference length
 *           ncand   - number of candidates (rows)
 *           map     - for j >= 1, map[j][0][i] is the pairwise column of
 *                     reference position i (1-based)
 *           rev_map - for j >= 1, rev_map[j][0][0] is the number of
 *                     pairwise columns; rev_map[j][0][k] is the reference
 *                     position at column k (-1 if none) and
 *                     rev_map[j][1][k] the position of candidate j at
 *                     column k (values <= 0 leave the pad character)
 *
 *           map[0] and rev_map[0] are never read.
 *
 * Returns:  a new MSA obtained from MSAAlloc(), with msa->ss and msa->au
 *           allocated here.
 *
 * NOTE(review): msa->ss[0] is never filled from cand[0].ss, so the
 * reference row keeps an all-'.' structure line -- confirm this is intended.
 * NOTE(review): rows are labelled from sqinfo[i], i.e. candidate i is
 * assumed to come from sequence i -- verify against the caller.
 */
MSA* MultipleAlignment(SQINFO* sqinfo, Cand* cand, int ncand, int*** map, int*** rev_map)
{
  int    i, j, k, len, align_len, pos;
  int    *align_pos;
  int    gap;
  int    temp;
  int    base;
  int    ncol;
  MSA    *msa;

  /* Count the length of the alignment.
   * align_pos[i] becomes the 1-based output column of reference position i.
   */
  len = cand[0].len;
  align_pos = (int *)MallocOrDie(sizeof(int) * (len + 1));
  align_pos[0] = 0;   /* keeps align_pos[len] defined when len == 0 */

  for (i = 1; i <= len; i++) {
    /* Widest step (in pairwise columns) any candidate needs to get from
     * reference position i-1 to i; never less than one column.
     */
    gap = 1;
    for (j = 1; j < ncand; j++) {
      if (i > 1) {
        temp = map[j][0][i] - map[j][0][i-1];
      }
      else {
        temp = map[j][0][i];
      }
      if (temp > gap)
        gap = temp;
    }
    align_pos[i] = gap;
    if (i > 1) align_pos[i] += align_pos[i-1];
  }

  /* Room for the longest tail after the last reference position. */
  gap = 0;
  for (j = 1; j < ncand; j++) {
    temp = rev_map[j][0][0] - map[j][0][len];
    if (temp > gap)
      gap = temp;
  }
  align_len = align_pos[len] + gap;

  /* Create the multiple alignment */
  msa = MSAAlloc(ncand, align_len);
  msa->nseq = ncand;
  msa->alen = align_len;
  msa->ss   = MallocOrDie(sizeof(char *) * ncand);

  for (i = 0; i < ncand; i++)
    {
      /* Start every row (sequence and structure) as all pads '.';
       * residues are written over the pads below.
       */
      msa->ss[i] = MallocOrDie(sizeof(char) * (align_len + 1));
      for (j = 0; j < align_len; j++) {
        msa->aseq[i][j] = '.';
        msa->ss[i][j] = '.';
      }
      msa->aseq[i][align_len] = '\0';
      msa->ss[i][align_len] = '\0';
    }

  /* Prefix: pairwise columns that precede the first reference position
   * are written left-aligned from output column 0.
   */
  for (j = 1; j < ncand; j++) {
    pos  = 0;
    ncol = rev_map[j][0][0];
    /* Bounded by ncol and align_len so a map without any reference
     * position cannot run off the end of the arrays.
     */
    for (k = 1; k <= ncol && pos < align_len && rev_map[j][0][k] < 1; k++) {
      temp = rev_map[j][1][k];
      if (temp > 0) {
        msa->aseq[j][pos] = cand[j].seq[temp-1];
        msa->ss[j][pos]   = cand[j].ss[temp-1];
      }
      pos++;
    }
  }

  for (i = 1; i <= len; i++) {
    base = align_pos[i] - 1;
    msa->aseq[0][base] = cand[0].seq[i-1];
    for (j = 1; j < ncand; j++) {
      k    = map[j][0][i];
      pos  = base;
      ncol = rev_map[j][0][0];
      /* Copy the column aligned to reference position i and the columns
       * without a reference position that follow it. The bounds on k and
       * pos are tested before rev_map is read, so entries past the last
       * pairwise column are never consulted and nothing is written at or
       * beyond align_len.
       */
      while (k <= ncol && pos < align_len &&
             (rev_map[j][0][k] == i || rev_map[j][0][k] == -1)) {
        temp = rev_map[j][1][k];
        if (temp > 0) {
          msa->aseq[j][pos] = cand[j].seq[temp-1];
          msa->ss[j][pos]   = cand[j].ss[temp-1];
        }
        pos++;
        k++;
      }
    }
  }

  /* "author" info */
  msa->au   = MallocOrDie(sizeof(char) * 10);
  sprintf(msa->au, "canda ");

  for (i = 0; i < ncand; i++)
    {
      msa->sqname[i] = sre_strdup(sqinfo[i].name, -1);
      msa->sqlen[i]  = sqinfo[i].len;
      if (sqinfo[i].flags & SQINFO_ACC)
        MSASetSeqAccession(msa, i, sqinfo[i].acc);
      if (sqinfo[i].flags & SQINFO_DESC)
        MSASetSeqDescription(msa, i, sqinfo[i].desc);
      msa->wgt[i] = 1.0;
    }

  free(align_pos);   /* scratch column table; was leaked before */
  return msa;
}



/* Function: TransformAlignment()
 *
 * Purpose:  Turn a pair of aligned strings into position <-> column maps.
 *           The first and last characters of the strings are skipped.
 *           A character counts as a position of its string when it is a
 *           base (BaseCode() != 0), a '(' not immediately followed by a
 *           base, or a ')' not immediately preceded by a base. Columns
 *           that hold a position in neither string are dropped and the
 *           remaining columns are renumbered from 1.
 *
 * Args:     align       - the two aligned strings; the length is taken
 *                         from align[0]
 *                         (assumes align[1] is at least as long -- TODO confirm)
 *           ret_map     - RETURN: map[s][0] is the number of positions of
 *                         string s, map[s][p] the column of position p
 *           ret_rev_map - RETURN: rev_map[s][0] is the number of columns
 *                         kept, rev_map[s][c] the position of string s at
 *                         column c, or -1 if it has none there
 *
 *           Each returned map is an array of two int arrays of
 *           strlen(align[0]) + 1 entries, all allocated here.
 */
void TransformAlignment(char* align[2], int*** ret_map, int*** ret_rev_map)
{
  int   i, pp;
  int   pos1, pos2;
  int   len;
  int** map = (int**) MallocOrDie(sizeof(int*)* 2);
  int** rev_map = (int**) MallocOrDie(sizeof(int*)* 2);

  len = strlen(align[0]);
  map[0] = (int*) MallocOrDie(sizeof(int) * (len + 1));
  map[1] = (int*) MallocOrDie(sizeof(int) * (len + 1));
  rev_map[0] = (int*) MallocOrDie(sizeof(int) * (len + 1));
  rev_map[1] = (int*) MallocOrDie(sizeof(int) * (len + 1));

  /* Mark every entry, including the last one at index len, as empty. */
  for (i = 0; i <= len; i++) {
    rev_map[0][i] = -1;
    rev_map[1][i] = -1;
  }

  /* First pass: number the positions of each string. Here map and rev_map
   * are still indexed by the raw string index i.
   */
  pos1 = 0;
  pos2 = 0;
  for (i = 1; i < len-1; i++) {  /* Ignore the beginning '(' and ending ')' */
    if ((align[0][i] == '(' && ! BaseCode(align[0][i+1])) ||
        (align[0][i] == ')' && ! BaseCode(align[0][i-1])) ||
        BaseCode(align[0][i])) {
      pos1++;
      map[0][pos1] = i;
      rev_map[0][i] = pos1;
    }
    if ((align[1][i] == '(' && ! BaseCode(align[1][i+1])) ||
        (align[1][i] == ')' && ! BaseCode(align[1][i-1])) ||
        BaseCode(align[1][i])) {
      pos2++;
      map[1][pos2] = i;
      rev_map[1][i] = pos2;
    }
  }
  map[0][0] = pos1;
  map[1][0] = pos2;

  /* Remove the empty columns in the alignment: pp counts the columns
   * kept so far and map is rewritten to the compacted numbering.
   */
  pp = 0;
  for (i = 1; i < len-1; i++) {
    if (rev_map[0][i] == -1 && rev_map[1][i] == -1)
      continue;
    pp++;

    if (rev_map[0][i] >= 0) {
      pos1 = rev_map[0][i];
      map[0][pos1] = pp;
    }
    if (rev_map[1][i] >= 0) {
      pos2 = rev_map[1][i];
      map[1][pos2] = pp;
    }
  }
  rev_map[0][0] = pp;
  rev_map[1][0] = pp;

  /* Rebuild rev_map in the compacted numbering. Every entry is cleared,
   * not just 1..pp: when columns were dropped, the entries above pp still
   * held first-pass position numbers, which a reader scanning past column
   * pp could mistake for real positions.
   */
  for (i = 1; i <= len; i++) {
    rev_map[0][i] = -1;
    rev_map[1][i] = -1;
  }

  for (i = 1; i <= map[0][0]; i++) {
    rev_map[0][map[0][i]] = i;
  }

  for (i = 1; i <= map[1][0]; i++) {
    rev_map[1][map[1][i]] = i;
  }

  *ret_map = map;
  *ret_rev_map = rev_map;
}




/* Function: main()
 *
 * Purpose:  Command-line driver.
 *             Usage: canda <cand_file> <seq_file> <out_file>
 *           Reads the candidates, aligns candidate 0 against every other
 *           candidate by tree edit distance, merges the pairwise
 *           alignments with MultipleAlignment() and writes the result in
 *           Stockholm format to <out_file> (or to stdout if that file
 *           cannot be opened).
 *
 * Returns:  0 on success. Exits with status 1 on a usage error; other
 *           failures go through Die().
 */
int main(int argc, char* argv[])
{

  char*   seqfile;
  int     nseq;
  char**  rseqs;
  SQINFO* sqinfo;
  int     format;
  char*   align_file;
  FILE*   align_fout;

  char*  candfile;
  int    ncand;
  Cand*  cand;
  int    i,j,k;

  Tree**   trees;
  char*    struc;
  int***   map;
  int***   rev_map;

  MSA     *msa;

  if (argc != 4) {
    fprintf(stderr, "Invalid parameter!. Usage: canda <cand_file> <seq_file> <out_file>\n");
    exit(1);
  }

  candfile = argv[1];
  seqfile = argv[2];
  align_file = argv[3];

  /* Read the candidates and build one tree per candidate. */
  nseq = 0;
  ncand = 0;
  cand = Read1DCand(candfile, &ncand);
  /* Everything below uses cand[0] as the reference candidate. */
  if (cand == NULL || ncand < 1)
    Die("No candidates read from file %s", candfile);

  trees = (Tree**) MallocOrDie(sizeof(Tree*) * ncand);
  for (i = 0; i < ncand; i++) {
    struc = ExpandFull(cand[i].ss, cand[i].seq);
    trees[i] = make_tree(struc);
    free(struc);
  }
  map = (int***)MallocOrDie(sizeof(int**) * ncand);
  rev_map = (int***)MallocOrDie(sizeof(int**) * ncand);

  /* Align candidate 0 against each other candidate. Slot 0 of map and
   * rev_map is left unset. The distance itself is not needed; aligned_line
   * is not assigned in this file and is presumably filled in by
   * tree_edit_distance() -- confirm against treedist.
   */
  i = 0;
  for (j = i+1; j < ncand; j++) {
    (void) tree_edit_distance(trees[i], trees[j]);
    TransformAlignment(aligned_line, &map[j], &rev_map[j]);
  }

  format = SQFILE_FASTA;

  /* read the training seqs from file */
  if (! ReadMultipleRseqs(seqfile, format, &rseqs, &sqinfo, &nseq))
    Die("Failed to read sequences from file %s", seqfile);

  /* MultipleAlignment() reads sqinfo[i] for every i < ncand. */
  if (nseq < ncand)
    Die("File %s holds %d sequences but %d candidates were read from %s",
        seqfile, nseq, ncand, candfile);

  msa = MultipleAlignment(sqinfo, cand, ncand, map, rev_map);

  if (align_file != NULL && (align_fout = fopen(align_file, "w")) != NULL)
    {
      WriteStockholm(align_fout, msa);
      printf("Alignment saved in file %s\n", align_file);
      fclose(align_fout);
    }
  else
    {
      /* Keep the stdout fallback, but say why the file was not written. */
      fprintf(stderr, "Warning: cannot open %s for writing; writing alignment to stdout\n",
              align_file);
      WriteStockholm(stdout, msa);
    }

  for (j = 0; j < ncand; j++) {
    free_tree(trees[j]);
    if (j > 0) {   /* slot 0 was never allocated */
      for (k = 0; k < 2; k++) {
        free(map[j][k]);
        free(rev_map[j][k]);
      }
      free(map[j]);
      free(rev_map[j]);
    }
  }
  free(trees);
  free(map);
  free(rev_map);
  free(cand);

  for (i = 0; i < nseq; i++)
    {
      FreeSequence(rseqs[i], &(sqinfo[i]));
    }
  free(rseqs);
  free(sqinfo);
  return 0;   /* success; the previous 'return 1' reported failure to the shell */
}
