/* cmfinder.c
 * Zizhen Yao
 * 
 * CVS $Id: cmfinder.c,v 3.1 2006/03/07 19:38:36 yzizhen Exp $
 * 
 * The main body of CMfinder. 
 * Iteratively refine the alignment and CM model using EM algorithm.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <math.h>


#include "squid.h"		/* general sequence analysis library    */
#include "msa.h"                /* squid's multiple alignment i/o       */
#include "structs.h"		/* data structures, macros, #define's   */
#include "funcs.h"		/* external functions                   */
#include "version.h"            /* versioning info for Infernal         */
#include "prior.h"

#include "cand.h"

#ifdef MEMDEBUG
#include "dbmalloc.h"
#endif

/* Sentinel "minus infinity" score used to initialise best/old scores.
 * The value is parenthesized so the macro expands as a single operand
 * wherever it is used in an expression.
 */
#define NEGINFINITY  (-9999999)

/* Short help text. It is handed to Getopt() in main(), printed for -h, and
 * used as the Die() message when fewer than two positional arguments
 * (<seqfile in> <cmfile output>) are supplied.
 */
static char usage[]  = "\
Usage: cmfinder [-options] <seqfile in> <cmfile output> \n\
where options are:\n\
     -c <candidate file>: the candidate file \n\
     -a <align file>    : the initial motif alignment \n\
     -i <cm file>       : the initial covariance model\n\
     -o <align file>    : the output motif structural alignment in stockholm format \n\
     -v verbose         : print intermediate alignments \n\
     -h                 : print short help and version info\n\
";

/* Additional help text for the double-dash options; printed only for -h. */
static char experts[] = "\
  Expert, in development, or infrequently used options are:\n\
   --g <gap threshold> : the gap threshold to determine the conserved column\n\
   --informat <s>: specify that input alignment is in format <s>\n\
";


/* Option table passed to Getopt() in main().
 * In this table every single-dash option carries TRUE in the second field
 * and every double-dash option carries FALSE; the third field is the
 * argument type (none, string or float).
 * NOTE(review): "--t" is accepted here but main() has no branch that
 * handles it, so its argument is silently ignored -- confirm whether it
 * should be removed or wired up.
 */
static struct opt_s OPTIONS[] = {
  { "-c", TRUE, sqdARG_STRING}, 
  { "-a", TRUE, sqdARG_STRING}, 
  { "-i", TRUE, sqdARG_STRING}, 
  { "-o", TRUE, sqdARG_STRING}, 
  { "-v", TRUE, sqdARG_NONE }, 
  { "-h", TRUE, sqdARG_NONE },
  { "--t", FALSE, sqdARG_STRING},
  { "--g", FALSE, sqdARG_FLOAT},
  { "--informat", FALSE, sqdARG_STRING},
};
/* Number of entries in OPTIONS. */
#define NOPTIONS (sizeof(OPTIONS) / sizeof(struct opt_s))

/* One-line program description printed by the -h option. */
static char banner[] = "cmfinder: learning motif covariance model for unaligned sequences\n";

/* Functions defined in other translation units.
 * save_hit/init_hits/return_hits are declared here but not called in this
 * file. Automodelmaker builds a CM from an alignment and is used by M_step().
 */
extern int save_hit(int left, int right, double score);
extern int init_hits(char* s);
extern int return_hits(Cand** ret_cand, int * ret_ncand);
extern CM_t * Automodelmaker(MSA *msa, char** dsq, double gapthresh, int gapcost);
/* Forward declaration; the definition is at the end of this file. */
MSA *  Parsetrees2Alignment(CM_t *cm, char **dsq, SQINFO *sqinfo, float *wgt, Parsetree_t **tr, int nseq);

/* Compile-time mode switches; no command-line option in this file changes them. */
static int    do_local = 0;   /* when set, ConfigLocal(cm, 0.5, 0.5) is applied to each model */
static int    do_small = 1;   /* E_step: use CYKDivideAndConquer instead of CYKInside */
static int    do_binary = 0;  /* passed to CMFileWrite when saving models */
static int    do_banded = 0;  /* Not ready in infernal */
static int    watson_crick=1; /* Parsetrees2Alignment: mark MP pairs failing IsBasePair as '-' in msa->ss */

/* IsBasePair(l, r): nonzero when the two digitized residue codes sum to
 * 3 or 5, zero otherwise.
 * NOTE(review): presumably the codes are A=0, C=1, G=2, U=3, which makes
 * the accepted sums A-U, C-G and G-U -- confirm against the alphabet.
 * Both arguments are parenthesized so that expression arguments are
 * treated as a unit; note that each argument is evaluated twice.
 */
#define IsBasePair(l, r) (((l) + (r)) == 3 || ((l) + (r)) == 5)


/* M_step
 * Maximization step: build a covariance model from an alignment.
 *
 * msa, dsq      - the alignment and its digitized form, handed to Automodelmaker
 * pri           - prior applied with PriorifyCM
 * null          - background frequencies, copied into the model (Alphabet_size entries)
 * gapthreshold,
 * gapcost       - forwarded unchanged to Automodelmaker
 *
 * The finished model is also written to the file "latest.cm" in the
 * current directory on every call.
 *
 * Returns the new model; the callers in this file release it with FreeCM().
 */
CM_t * M_step (MSA* msa, char **dsq, Prior_t *pri, double *null, float gapthreshold, int gapcost)
{
  const char *snapshot = "latest.cm";   /* copy of the most recent model */
  CM_t       *model;
  FILE       *out;
  int         a;

  model = Automodelmaker(msa, dsq, gapthreshold, gapcost);

  /* Counts -> probabilities, install the background, then log-odds form. */
  PriorifyCM(model, pri);
  a = 0;
  while (a < Alphabet_size) {
    model->null[a] = null[a];
    a++;
  }

  if (do_local) {
    ConfigLocal(model, 0.5, 0.5);
  }
  CMLogoddsify(model);
  CMHackInsertScores(model);    /* "TEMPORARY" fix for bad priors */

  out = fopen(snapshot, "w");
  if (out == NULL) {
    Die("Failed to open %s for writing", snapshot);
  }
  CMFileWrite(out, model, do_binary);
  fclose(out);

  return model;
}



/* E_step
 * Expectation step: score every candidate against the model, weight the
 * candidates, keep the best few per sequence and align them.
 *
 * cm              - current model
 * nseq, sqinfo    - number of sequences and their info (sqinfo[i].len is used)
 * seq_weight      - optional per-sequence weights (may be NULL)
 * ncand, cand     - candidates per sequence; score and weight are filled in here
 * cand_dsq        - digitized candidates; if NULL they are digitized here
 *                   and released before returning
 * max_chosen_cand - upper bound on candidates kept per sequence
 * ret_score       - if not NULL, receives the sum of score*weight over all candidates
 *
 * The motif probability used for weighting is kept in a static variable,
 * so it carries over from one call to the next.
 *
 * Returns the alignment of the kept candidates, built by Parsetrees2Alignment.
 */
MSA* E_step(struct cm_s *cm, int nseq, SQINFO *sqinfo, double * seq_weight, 
	    int* ncand, Cand** cand, char*** cand_dsq, 
	    int max_chosen_cand, double *ret_score)
{
  static const double motif_prob0 = 0.3;  /* constant mixed into each update  */
  static double       motif_prob  = 0.3;  /* persists across calls            */

  Cand        **picked;        /* kept candidates, over all sequences         */
  Parsetree_t **picked_tr;     /* their parse trees, same order               */
  char        **picked_dsq;    /* their digitized sequences (borrowed)        */
  SQINFO       *picked_info;   /* their SQINFO records                        */
  float        *picked_wgt;    /* their alignment weights                     */
  Parsetree_t **seq_tr;        /* parse trees of one sequence's candidates    */
  Cand        **ranked;        /* one sequence's candidates, sorted by score  */
  MSA          *aln;
  double        sc;            /* score of the current candidate              */
  double        odds_sum;      /* sum of 2^score over one sequence            */
  double        per_pos;       /* motif_prob / sequence length                */
  double        top;           /* best score within one sequence              */
  double        expected = 0;  /* accumulated score * weight                  */
  int           npicked  = 0;
  int           own_dsq  = 0;  /* set when cand_dsq was built here            */
  int           s, c, k;

  /* Digitize the candidates ourselves if the caller did not. */
  if (cand_dsq == NULL) {
    own_dsq  = 1;
    cand_dsq = (char***) malloc (sizeof(char**) * nseq);
    for (s = 0; s < nseq; s++) {
      cand_dsq[s] = (char**) malloc(sizeof(char*) * ncand[s]);
      for (c = 0; c < ncand[s]; c++)
        cand_dsq[s][c] = DigitizeSequence(cand[s][c].seq, cand[s][c].len);
    }
  }

  picked    = (Cand **) calloc((size_t) nseq * max_chosen_cand, sizeof(Cand *));
  picked_tr = (Parsetree_t **) malloc (sizeof(Parsetree_t *) * nseq * max_chosen_cand);
  seq_tr    = (Parsetree_t **) malloc ( MAXCAND * sizeof(Parsetree_t *));

  for (s = 0; s < nseq; s++) {
    if (ncand[s] <= 0)
      continue;
    if (ncand[s] > MAXCAND) Die("Too many candidates in sequence %d", s);

    /* Align each candidate to the model and record its score. */
    odds_sum = 0;
    for (c = 0; c < ncand[s]; c++) {
      if (do_small)
        sc = CYKDivideAndConquer(cm, cand_dsq[s][c], cand[s][c].len, 0, 1, cand[s][c].len, &seq_tr[c]);
      else
        sc = CYKInside(cm, cand_dsq[s][c], cand[s][c].len, 0, 1, cand[s][c].len, &seq_tr[c]);
      cand[s][c].score = sc;
      odds_sum += pow(2, sc);
    }

    /* Candidate weights under the ZOOP model. */
    per_pos = motif_prob / sqinfo[s].len;
    for (c = 0; c < ncand[s]; c++) {
      cand[s][c].weight = pow(2, cand[s][c].score) * per_pos / (1 - motif_prob + odds_sum * per_pos);
      expected += cand[s][c].score * cand[s][c].weight;
    }

    /* Rank by score and keep the leading candidates that pass the cutoff. */
    ranked = SortCand(cand[s], ncand[s], CompCandByScore);
    top    = ranked[0]->score;

    for (k = 0; k < max_chosen_cand && k < ncand[s]; k++) {
      if (!(ranked[k]->score + log(ncand[s]) >= 0.1 * top))
        break;
      picked_tr[npicked] = seq_tr[ranked[k]->cand_id];
      picked[npicked]    = ranked[k];
      npicked++;
    }
    /* Parse trees of everything that was not kept are released now. */
    while (k < ncand[s]) {
      FreeParsetree(seq_tr[ranked[k]->cand_id]);
      k++;
    }
    free(ranked);
  }

  /* Re-estimate the motif probability from the weights just assigned. */
  motif_prob = 0;
  for (s = 0; s < nseq; s++)
    for (c = 0; c < ncand[s]; c++)
      motif_prob += cand[s][c].weight;
  motif_prob = (motif_prob + motif_prob0) / (nseq + 1);

  if (ret_score != NULL)
    *ret_score = expected;

  /* Gather what Parsetrees2Alignment needs for the kept candidates. */
  picked_dsq  = (char**) malloc (sizeof(char*) * npicked);
  picked_info = Cand2Sqinfo(picked, npicked, sqinfo);
  picked_wgt  = (float*) malloc(sizeof(float) * npicked);
  for (k = 0; k < npicked; k++) {
    picked_dsq[k] = cand_dsq[picked[k]->seq_id][picked[k]->cand_id];
    picked_wgt[k] = picked[k]->weight;
    if (seq_weight) picked_wgt[k] *= seq_weight[picked[k]->seq_id];
  }

  aln = Parsetrees2Alignment(cm, picked_dsq, picked_info, picked_wgt, picked_tr, npicked);

  for (k = 0; k < npicked; k++)
    FreeParsetree(picked_tr[k]);
  free(picked_tr);
  free(picked_wgt);
  free(picked_dsq);
  free(picked_info);
  free(picked);
  free(seq_tr);

  if (own_dsq) {
    for (s = 0; s < nseq; s++)
      Free2DArray((void**)cand_dsq[s], ncand[s]);
    free(cand_dsq);
  }
  return aln;
}

/* CMScan
 * Scan every sequence with the model and store the hits as candidates.
 *
 * cm          - model to scan with
 * nseq        - number of sequences
 * dsq, rseqs  - digitized and raw text forms of the same sequences
 * window      - window size forwarded to CYKScan
 * cand        - cand[i] must have room for MAXCAND entries; filled here
 * ncand       - ncand[i] receives the number of candidates stored for sequence i
 *
 * At most MAXCAND hits are kept per sequence, in the order CYKScan returned
 * them; any further hits are dropped. Hit coordinates are treated as
 * 1-based and inclusive when the subsequence is copied out of rseqs[i].
 * Calls Die() if a hit does not fit in a candidate buffer or if no
 * sequence produced any hit (which includes nseq == 0).
 */
void  CMScan(CM_t *cm, int nseq, char **dsq, char** rseqs, int window, Cand **cand, int *ncand)
{
  int    i, j;
  int    nhits;                 /* number of hits in a seq */
  int   *hitr;                  /* initial states for hits */
  int   *hiti;                  /* start positions of hits */
  int   *hitj;                  /* end positions of hits */
  float *hitsc;                 /* scores of hits */
  int    len;
  int    total_cand = 0;

  for (i = 0; i < nseq; i++) {
    CYKScan(cm, dsq[i], strlen(rseqs[i]), window,
            &nhits, &hitr, &hiti, &hitj, &hitsc);
    ncand[i] = 0;
    if (nhits > 0) {
      if (nhits > MAXCAND) nhits = MAXCAND;
      memset(cand[i], 0, sizeof(Cand) * nhits);
      ncand[i] = nhits;
      total_cand += nhits;
      for (j = 0; j < nhits; j++) {
        len = hitj[j] - hiti[j] + 1;
        /* One byte is kept for the terminating NUL written below. */
        if (len > MAXLENGTH - 1)  Die ("Candidate too long!\n");

        cand[i][j].start   = hiti[j];
        cand[i][j].stop    = hitj[j];
        cand[i][j].score   = hitsc[j];
        cand[i][j].cand_id = j;
        cand[i][j].seq_id  = i;
        cand[i][j].len     = len;
        strncpy(cand[i][j].seq, rseqs[i] + cand[i][j].start - 1, len);
        cand[i][j].seq[len] = '\0';
      }
    }
    free(hitr);
    free(hiti);
    free(hitj);
    free(hitsc);
  }

  /* The original re-tested nhits against MAXCAND here. Because nhits is
   * clamped inside the loop, that test could never succeed, and with
   * nseq == 0 it read an uninitialized variable; it has been removed.
   */
  if (total_cand == 0) {
    Die("No candidates are found in CM Scan");
  }
}


/* main
 * Command-line driver for cmfinder.
 *
 * Steps, as implemented below:
 *   1. Parse the options; the two positional arguments are the input
 *      sequence file and the output CM file.
 *   2. Read the sequences (FASTA), digitize them and estimate the null
 *      (background) residue frequencies from their composition.
 *   3. Obtain a starting model: read it from -i <cm file>, or otherwise
 *      build it from the -a <align file> alignment with M_step(). One of
 *      the two is required; -i takes precedence when both are given.
 *   4. If -c <candidate file> is given, iterate E_step()/M_step() over
 *      those candidates until the score change drops below the threshold
 *      or max_iterations is reached, and continue with the best model.
 *   5. Iterate again, this time regenerating the candidates with CMScan()
 *      at the start of every iteration.
 *   6. Realign with the best model keeping one candidate per sequence,
 *      write the alignment (to -o or stdout) and save the model.
 *
 * Ownership convention used in the loops: `msa` and `cm` are set to NULL
 * as soon as the object they point to has been released, so the cleanup
 * code after each loop can tell what is still live.
 *
 * Returns 0 on success; errors are reported through Die().
 */
int
main(int argc, char **argv)
{
  int        format;              /* alifile format                            */
  char      *seqfile    = NULL;   /* training sequence file                    */
  char      *alifile    = NULL;   /* file with the initial alignment           */
  char      *candfile   = NULL;   /* file with candidate motifs                */
  char      *cmfile     = NULL;   /* OUTPUT: saved model                       */
  char      *final_file = NULL;   /* OUTPUT: final alignment of motifs         */
  CMFILE    *in_cmfp;             /* open CM file                              */
  FILE      *cmfp;                /* OUTPUT: fp to cmfile                      */

  char     **rseqs;               /* training sequences                        */
  char     **dsq;                 /* digitized training sequences              */
  SQINFO    *sqinfo;              /* array of sqinfo structures for rseqs      */
  int        nseq;                /* number of seqs                            */

  Cand     **cand;                /* all the candidates for all the sequences  */
  int       *ncand;               /* the number of cands of each sequence      */
  char    ***cand_dsq;            /* digitized sequences of candidates         */

  CM_t      *cm      = NULL;      /* current model (NULL once released)        */
  CM_t      *best_cm = NULL;      /* best model so far                         */

  double     bestcmscore;         /* the score of the best cm                  */
  double     totscore;            /* summed scores over training seqs          */
  double     oldscore;            /* previous totscore for old model           */
  double     delta;               /* fractional change in scores for iteration */
  int        iteration;           /* iteration number we're on                 */
  double     gapthreshold = 0.5;  /* gap threshold for conserved columns       */
  double     gapcost      = 100;  /* gap penalty in M step (scan stage)        */
  int        max_iterations;
  double     threshold;           /* fractional tolerance, convergence test    */

  char      *in_cmfile = NULL;    /* file containing input model               */
  Prior_t   *pri       = NULL;    /* mixture Dirichlet prior structure         */

  double    *seq_weight = NULL;   /* never set in this file; passed as NULL    */
  MSA       *msa        = NULL;   /* current alignment (NULL once released)    */
  char     **msa_dsq    = NULL;
  double     null[4];
  double     nt_count   = 0;

  char  *optname;                 /* name of option found by Getopt()          */
  char  *optarg;                  /* argument found by Getopt()                */
  int    optind;                  /* index in argv[]                           */
  int    verbose = 0;
  int    i, j;
  int    max_cand;
  int    window  = 200;           /* scan window passed to CMScan              */

#ifdef MEMDEBUG
  unsigned long histid1, histid2, orig_size, current_size;
#endif

  /***********************************************
   * Parse command line
   ***********************************************/

  threshold            = 0.02;          /* default: 2% */
  max_iterations       = 100;
  format               = MSAFILE_STOCKHOLM;

  while (Getopt(argc, argv, OPTIONS, NOPTIONS, usage,
                &optind, &optname, &optarg))  {
    if      (strcmp(optname, "-c") == 0)        candfile  = optarg;
    else if (strcmp(optname, "-a") == 0)        alifile   = optarg;
    else if (strcmp(optname, "-i") == 0)        in_cmfile = optarg;
    else if (strcmp(optname, "-o") == 0)        final_file= optarg;
    else if (strcmp(optname, "-v") == 0)        verbose   = 1;
    else if (strcmp(optname, "--g") == 0)       gapthreshold   = atof(optarg);
    else if (strcmp(optname, "--informat") == 0){
      format = String2SeqfileFormat(optarg);
      if (format == MSAFILE_UNKNOWN)
        Die("unrecognized sequence file format \"%s\"", optarg);
      if (! IsAlignmentFormat(format))
        Die("%s is an unaligned format, can't read as an alignment", optarg);
    }
    else if (strcmp(optname, "-h") == 0) {
      puts(banner);
      puts(usage);
      puts(experts);
      exit(EXIT_SUCCESS);
    }
  }
  if (argc - optind < 2) Die("%s\n", usage);

  seqfile = argv[optind++];
  cmfile  = argv[optind++];

  /***********************************************
   * Get sequence data
   ***********************************************/
  /* read the training seqs from file */
  if (! ReadMultipleRseqs(seqfile, SQFILE_FASTA, &rseqs, &sqinfo, &nseq))
    Die("Failed to read any sequences from file %s", seqfile);

  /* Preprocess */
  for (i = 0; i < nseq; i++){
    PrepareSequence(rseqs[i]);
  }

  /* Digitize and count residues to estimate the null model. */
  dsq = (char **) malloc(sizeof(char *) * nseq);
  for (i = 0; i < Alphabet_size; i++) null[i] = 0;
  for (i = 0; i < nseq; i++) {
    dsq[i] = DigitizeSequence(rseqs[i], sqinfo[i].len);
    for (j = 1; j <= sqinfo[i].len; j++) {
      if (dsq[i][j] < 0 || dsq[i][j] >= Alphabet_size)
        Die("Seq %d position %d: Invalid letter", i, j);
      null[(int) dsq[i][j]] ++;
    }
    nt_count += sqinfo[i].len;
  }
  /* Without any residue the frequencies below would be 0/0. */
  if (nt_count <= 0)
    Die("No residues found in sequence file %s", seqfile);
  for (i = 0; i < Alphabet_size; i++) {
    null[i] /= nt_count;
  }

  /* Get prior */
  pri = Prior_Default();

  /* Construct Initial CM */
  if (in_cmfile){
    /* Read CM File */
    if ((in_cmfp = CMFileOpen(in_cmfile, NULL)) == NULL)
      Die("Failed to open covariance model save file %s\n%s\n", in_cmfile, usage);
    if (! CMFileRead(in_cmfp, &cm))
      Die("Failed to read a CM from %s -- file corrupt?\n", in_cmfile);
    if (cm == NULL)
      Die("%s empty?\n", in_cmfile);
    CMFileClose(in_cmfp);

    /* Construct CM */
    if (do_local) ConfigLocal(cm, 0.5, 0.5);
    CMLogoddsify(cm);
  }
  else if (alifile){
    /* Read initial alignment file */
    MSAFILE     *afp = NULL;        /* file handle of initial alignment */
    if ((afp = MSAFileOpen(alifile, format, NULL)) == NULL)
      Die("Alignment file %s could not be opened for reading", alifile);
    if ((msa = MSAFileRead(afp)) != NULL)
      {
        /* Widen the scan window to 1.5x the alignment length if needed. */
        if (msa->alen *1.5 > window){
          window = msa->alen * 1.5;
        }
        for (i = 0; i < msa->nseq; i++){
          PrepareSequence(msa->aseq[i]);
        }
        /* Estimate CM */
        MSAFileClose(afp);

        msa_dsq = DigitizeAlignment(msa->aseq, msa->nseq, msa->alen);
        cm = M_step(msa, msa_dsq, pri, null, gapthreshold, 50);
        Free2DArray((void **)msa_dsq, msa->nseq);
        MSAFree(msa);
        msa = NULL;
      }
    else Die("Fail to read alignment file %s", alifile);
  }
  else {
    Die("No initial Covariance Model or initial alignment is available");
  }

  if (!candfile) {
    cand = (Cand **) malloc(sizeof(Cand*) * nseq);
    ncand = (int *) malloc(sizeof(int) * nseq);
    for (i = 0 ; i < nseq; i++)
      cand[i] = (Cand *)malloc(sizeof(Cand) *MAXCAND);
  }
  else{
    /* NOTE(review): CMScan() later writes up to MAXCAND entries into each
     * cand[i]; confirm that Read2DCand allocates rows of that size.
     */
    cand = Read2DCand(candfile,  nseq, &ncand, &max_cand);
    cand_dsq = malloc(sizeof(char **) * nseq);

    for (i = 0; i < nseq; i++) {
      if (ncand[i] <= 0) {
        cand_dsq[i] = NULL;
        continue;
      }
      cand_dsq[i] = (char**) malloc(sizeof(char*) * ncand[i]);
      for (j = 0; j < ncand[i]; j++) {
        cand_dsq[i][j] = DigitizeSequence(cand[i][j].seq, cand[i][j].len);
      }
    }
    bestcmscore =  -1.0 * HUGE_VAL;
    oldscore = -1.0 * HUGE_VAL;
    iteration = 0;
    while( iteration < max_iterations ) {
      iteration++;
      if (verbose)
        printf("Iteration %4d  : model of %d nodes\n ", iteration, cm->nodes);

      /* Align the supplied candidates to the current model. */
      msa = E_step(cm, nseq, sqinfo, seq_weight, ncand, cand, cand_dsq, 10, &totscore);
      if (totscore < -10) {
        Die("Bad alignment %f!", totscore);
      }
      /* Keep the best model so far */
      if (totscore > bestcmscore) {
        if (best_cm) FreeCM(best_cm);
        best_cm = cm;
        bestcmscore = totscore;
      }
      else{
        FreeCM(cm);
        cm = NULL;
      }
      /* If we've converged, stop.
       * Else, make a new model from the alignment.
       */
      delta = (totscore - oldscore) / fabs(totscore);
      if (verbose)
        printf("score %.3f, delta %.3f\n", totscore / (double) nseq, delta);
      if (delta < threshold)
        {
          /* Converged. Break out of iteration loop; msa is released below. */
          break;
        }
      oldscore = totscore;
      if (verbose){
        WriteStockholm(stdout, msa);
      }

      msa_dsq = DigitizeAlignment(msa->aseq, msa->nseq, msa->alen);
      cm = M_step(msa, msa_dsq, pri, null, gapthreshold, 50);
      Free2DArray((void **)msa_dsq, msa->nseq);
      MSAFree(msa);
      msa = NULL;
    }

    /* Still holding an alignment only if the loop ended on convergence. */
    if (msa != NULL) {
      MSAFree(msa);
      msa = NULL;
    }
    /* If the loop ran out of iterations, cm is a fresh model that was
     * never scored; release it before continuing with the best one.
     */
    if (cm != NULL && cm != best_cm) FreeCM(cm);
    cm = best_cm;
    best_cm = NULL;

    for (i = 0; i < nseq; i++) {
      if (ncand[i] > 0) Free2DArray((void **)cand_dsq[i], ncand[i]);
    }
    free(cand_dsq);
  }

  /* Now scan the whole sequences looking for candidates. */

  for (i = 0; i < nseq; i++)  ncand[i] = 0;

  bestcmscore = NEGINFINITY;
  oldscore = NEGINFINITY;
  iteration = 0;

  while( iteration < max_iterations ) {
    iteration++;
    if (verbose)
      printf("Iteration %4d  : model of %d nodes\n ", iteration, cm->nodes);

    CMScan(cm, nseq, dsq, rseqs, window, cand, ncand);
    totscore = 0.0;

    msa = E_step(cm, nseq, sqinfo, seq_weight, ncand, cand, NULL, 10, &totscore);

    /* Keep the best model so far */
    if (totscore >= bestcmscore) {
      if (best_cm) FreeCM(best_cm);
      best_cm = cm;
      bestcmscore = totscore;
    }
    else{
      FreeCM(cm);
      cm = NULL;
    }

    if (totscore < -10) {
      Die("Bad alignment %f!", totscore);
    }

    delta = (totscore - oldscore) / fabs(totscore);
    if (verbose)
      printf("score %.3f, delta %.3f\n", totscore / (double) nseq, delta);

    /* If we've converged, stop.
     * Else, make a new model from the alignment.
     */
    if (delta < threshold )
      {
        /* Converged. Break out of iteration loop; msa is released below. */
        break;
      }
    oldscore = totscore;
    if (verbose){
      WriteStockholm(stdout, msa);
    }
    msa_dsq = DigitizeAlignment(msa->aseq, msa->nseq, msa->alen);
    cm = M_step(msa, msa_dsq, pri, null, gapthreshold, gapcost);

    Free2DArray((void **)msa_dsq, msa->nseq);
    MSAFree(msa);
    msa = NULL;
  }

  /**********************************************
   * Save the final alignment of selected motifs
   *********************************************/
  /* The alignment from the last iteration is about to be replaced. It is
   * non-NULL only after a convergence break, so it is freed exactly once.
   */
  if (msa != NULL) {
    MSAFree(msa);
    msa = NULL;
  }
  if (best_cm == NULL)
    Die("No valid model was found");
  if (best_cm != cm) {
    /* cm is either already released (NULL) or an unscored model built by
     * the last M_step; drop it and rescan with the best model so that the
     * candidates match the model used for the final alignment.
     */
    if (cm != NULL) FreeCM(cm);
    cm = best_cm;
    CMScan(cm, nseq, dsq, rseqs, window, cand, ncand);
  }
  msa = E_step(cm, nseq, sqinfo, seq_weight, ncand, cand, NULL, 1, &totscore);
  /* NOTE(review): rows of sequences without candidates are not freed here. */
  for (i = 0; i < nseq; i++)  if (ncand[i]>0) free(cand[i]);

  if (verbose)
    printf("Final score %.3f\n", totscore / (double) nseq);

  if (final_file != NULL)
    {
      FILE        *final_fout;
      if ((final_fout = fopen(final_file, "w")) == NULL)
        Die("Can't write to output file %s", final_file);
      WriteStockholm(final_fout, msa);
      if (verbose)
        printf("Alignment saved in file %s\n", final_file);

      fclose(final_fout);
    }
  else
    WriteStockholm(stdout, msa);

  /***********************************************
   * Save the new model and exit.
   ***********************************************/

  if ((cmfp = fopen(cmfile, "w")) == NULL)
    Die("Failed to open %s for writing", cmfile);
  CMFileWrite(cmfp, cm, do_binary);
  fclose(cmfp);

  free(cand);
  free(ncand);
  Prior_Destroy(pri);
  MSAFree(msa);
  FreeCM(cm);
  for (i = 0; i < nseq; i++)
    FreeSequence(rseqs[i], &(sqinfo[i]));
  Free2DArray((void**)dsq, nseq);
  free(sqinfo);
  if (verbose)
    printf("New covariance model written to file %s\n", cmfile);

  return 0;

}


/* Parsetrees2Alignment 
 * Copy From Infernal package 
 *
 * Build a multiple alignment from one parse tree per sequence.
 *
 * cm     - the model the parse trees refer to
 * dsq    - digitized sequences, indexed with the emitl/emitr positions
 *          stored in the parse trees
 * sqinfo - per-sequence info; name, len, and (when flagged) acc and desc
 *          are copied into the alignment
 * wgt    - per-sequence weights, or NULL to give every sequence weight 1.0
 * tr     - parse trees, one per sequence
 * nseq   - number of sequences / parse trees
 *
 * In addition to the aligned sequences, a per-sequence structure line
 * msa->ss[i] is filled in: '<' and '>' at the two columns of each MP
 * state, '.' elsewhere. When watson_crick is set, a pair whose residues
 * do not satisfy IsBasePair is written as '-' '-' instead.
 *
 * Returns the new alignment; callers in this file release it with MSAFree().
 */
MSA *
Parsetrees2Alignment(CM_t *cm, char **dsq, SQINFO *sqinfo, float *wgt, 
		     Parsetree_t **tr, int nseq)
{
  MSA         *msa;          /* multiple sequence alignment */
  CMEmitMap_t *emap;         /* consensus emit map for the CM */
  int          i;            /* counter over traces */
  int          v, nd;        /* state, node indices */
  int          cpos;         /* counter over consensus positions (0)1..clen */
  int         *matuse;       /* TRUE if we need a cpos in mult alignment */
  int         *iluse;        /* # of IL insertions after a cpos for 1 trace */
  int         *eluse;        /* # of EL insertions after a cpos for 1 trace */
  int         *iruse;        /* # of IR insertions after a cpos for 1 trace */
  int         *maxil;        /* max # of IL insertions after a cpos */
  int         *maxel;        /* max # of EL insertions after a cpos */
  int         *maxir;        /* max # of IR insertions after a cpos */
  int	      *matmap;       /* apos corresponding to a cpos */
  int         *ilmap;        /* first apos for an IL following a cpos */
  int         *elmap;        /* first apos for an EL following a cpos */
  int         *irmap;        /* first apos for an IR following a cpos */
  int          alen;	     /* length of msa in columns */
  int          apos;	     /* position in an aligned sequence in MSA */
  int          rpos;	     /* position in an unaligned sequence in dsq */
  int          tpos;         /* position in a parsetree */
  int          el_len;	     /* length of an EL insertion in residues */
  CMConsensus_t *con;        /* consensus information for the CM */
  int          prvnd;	     /* keeps track of previous node for EL */
  /* NOTE(review): prvnd is read before being assigned if the first state
   * of a trace is an EL state; presumably a trace never starts with EL --
   * confirm.
   */

  emap = CreateEmitMap(cm);

  /* All work arrays are indexed by consensus position 0..clen. */
  matuse = malloc(sizeof(int)*(emap->clen+1));   
  iluse  = malloc(sizeof(int)*(emap->clen+1));   
  eluse  = malloc(sizeof(int)*(emap->clen+1));   
  iruse  = malloc(sizeof(int)*(emap->clen+1));   
  maxil  = malloc(sizeof(int)*(emap->clen+1));   
  maxel  = malloc(sizeof(int)*(emap->clen+1));   
  maxir  = malloc(sizeof(int)*(emap->clen+1));   
  matmap = malloc(sizeof(int)*(emap->clen+1));   
  ilmap  = malloc(sizeof(int)*(emap->clen+1));   
  elmap  = malloc(sizeof(int)*(emap->clen+1));   
  irmap  = malloc(sizeof(int)*(emap->clen+1));   
  
  for (cpos = 0; cpos <= emap->clen; cpos++) 
    {
      matuse[cpos] = 0;
      maxil[cpos] = maxel[cpos] = maxir[cpos] = 0;
      ilmap[cpos] = elmap[cpos] = irmap[cpos] = 0;
    }

  /* Look at all the traces; find maximum length of
   * insert needed at each of the clen+1 possible gap
   * points. (There are three types of insert, IL/EL/IR.)
   * Also find whether we don't need some of the match
   * (consensus) columns.
   */
  for (i = 0; i < nseq; i++) 
    {
      for (cpos = 0; cpos <= emap->clen; cpos++) 
	iluse[cpos] = eluse[cpos] = iruse[cpos] = 0;

      for (tpos = 0; tpos < tr[i]->n; tpos++)
	{
	  v  = tr[i]->state[tpos];
	  /* An EL state is attributed to the node of the state before it. */
	  if (cm->sttype[v] == EL_st) nd = prvnd;
	  else                        nd = cm->ndidx[v];
	  
	  switch (cm->sttype[v]) {
	  case MP_st: 
	    matuse[emap->lpos[nd]] = 1;
	    matuse[emap->rpos[nd]] = 1;
	    break;
	  case ML_st:
	    matuse[emap->lpos[nd]] = 1;
	    break;
	  case MR_st:
	    matuse[emap->rpos[nd]] = 1;
	    break;
	  case IL_st:
	    iluse[emap->lpos[nd]]++;
	    break;
	  case IR_st:		
            /* remember, convention on rpos is that IR precedes this
             * cpos. Make it after the previous cpos, hence the -1. 
	     */
	    iruse[emap->rpos[nd]-1]++;
	    break;
	  case EL_st:
	    el_len = tr[i]->emitr[tpos] - tr[i]->emitl[tpos] + 1;
	    eluse[emap->epos[nd]] = el_len;
            /* not possible to have >1 EL in same place; could assert this */
	    break;
	  }

	  prvnd = nd;
	} /* end looking at trace i */

      for (cpos = 0; cpos <= emap->clen; cpos++) 
	{
	  if (iluse[cpos] > maxil[cpos]) maxil[cpos] = iluse[cpos];
	  if (eluse[cpos] > maxel[cpos]) maxel[cpos] = eluse[cpos];
	  if (iruse[cpos] > maxir[cpos]) maxir[cpos] = iruse[cpos];
	}
    } /* end calculating lengths used by all traces */
  

  /* Now we can calculate the total length of the multiple alignment, alen;
   * and the maps ilmap, elmap, and irmap that turn a cpos into an apos
   * in the multiple alignment: e.g. for an IL that follows consensus position
   * cpos, put it at or after apos = ilmap[cpos] in aseq[][].
   * IR's are filled in backwards (3'->5') and rightflushed.
   */
  alen = 0;
  for (cpos = 0; cpos <= emap->clen; cpos++)
    {
      if (matuse[cpos]) {
	matmap[cpos] = alen; 
	alen++;
      } else 
	matmap[cpos] = -1;

      ilmap[cpos] = alen; alen += maxil[cpos];
      elmap[cpos] = alen; alen += maxel[cpos];
      alen += maxir[cpos]; irmap[cpos] = alen-1; 
    }

  /* We're getting closer.
   * Now we can allocate for the MSA.
   */
  msa = MSAAlloc(nseq, alen);
  msa->nseq = nseq;
  msa->alen = alen;
  /* Per-sequence structure lines are allocated here, one per sequence. */
  msa->ss   = (char **)malloc(sizeof(char *) * nseq);    
  for (i = 0; i < nseq; i++)
    {	
      msa->ss[i] = (char *)malloc(sizeof(char) * (alen + 1));      
    }
  
  for (i = 0; i < nseq; i++){    
      /* Initialize the aseq with all pads '.' (in insert cols) 
       * and deletes '-' (in match cols).
       */



      for (apos = 0; apos < alen; apos++){	
	msa->aseq[i][apos] = '.';
	msa->ss[i][apos] = '.';
      }

      for (cpos = 0; cpos <= emap->clen; cpos++)
	if (matmap[cpos] != -1) msa->aseq[i][matmap[cpos]] = '-';


      msa->aseq[i][alen] = '\0';
      msa->ss[i][alen] = '\0';

      /* Traverse this guy's trace, and place all his
       * emitted residues.
       */
      for (cpos = 0; cpos <= emap->clen; cpos++)
	iluse[cpos] = iruse[cpos] = 0;

      for (tpos = 0; tpos < tr[i]->n; tpos++) 
	{
	  v  = tr[i]->state[tpos];	 

	  if (cm->sttype[v] == EL_st) nd = prvnd;
	  else                        nd = cm->ndidx[v];

	  switch (cm->sttype[v]) {
	  case MP_st:
	    {
	      char lc,rc;   /* digitized residues on the left and right side */
	      int  l,r;	      /* their alignment columns */
	      cpos = emap->lpos[nd];
	      apos = matmap[cpos];
	      rpos = tr[i]->emitl[tpos];
	      msa->aseq[i][apos] = Alphabet[(int) dsq[i][rpos]];
	      msa->ss[i][apos] = '<';	    
	      lc= dsq[i][rpos];
	      l = apos;
	      
	      cpos = emap->rpos[nd];
	      apos = matmap[cpos];
	      rpos = tr[i]->emitr[tpos];
	      msa->aseq[i][apos] = Alphabet[(int) dsq[i][rpos]];
	      msa->ss[i][apos] = '>';	    
	      rc= dsq[i][rpos];
	      r  = apos;	      

	      /* Demote the pair annotation when the residues cannot pair. */
	      if(watson_crick && !IsBasePair(lc,rc)) {
		msa->ss[i][l]='-';
		msa->ss[i][r]='-';
	      }	      
	    }	    
	    break;	    
	  case ML_st:
	    cpos = emap->lpos[nd];
	    apos = matmap[cpos];
	    rpos = tr[i]->emitl[tpos];
	    msa->aseq[i][apos] = Alphabet[(int) dsq[i][rpos]];
	    break;

	  case MR_st:
	    cpos = emap->rpos[nd];
	    apos = matmap[cpos];
	    rpos = tr[i]->emitr[tpos];
	    msa->aseq[i][apos] = Alphabet[(int) dsq[i][rpos]];
	    break;

	  /* NOTE(review): tolower() is used below but <ctype.h> is not
	   * included directly by this file; presumably it arrives through
	   * one of the project headers -- confirm.
	   */
	  case IL_st:
	    cpos = emap->lpos[nd];
	    apos = ilmap[cpos] + iluse[cpos];
	    rpos = tr[i]->emitl[tpos];
	    msa->aseq[i][apos] = tolower((int) Alphabet[(int) dsq[i][rpos]]);
	    iluse[cpos]++;
	    break;

	  case EL_st: 
            /* we can assert eluse[cpos] always == 0 when we enter,
	     * because we can only have one EL insertion event per 
             * cpos. If we ever decide to regularize (split) insertions,
             * though, we'll want to calculate eluse in the rpos loop.
             */
	    cpos = emap->epos[nd]; 
	    apos = elmap[cpos]; 
	    for (rpos = tr[i]->emitl[tpos]; rpos <= tr[i]->emitr[tpos]; rpos++)
	      {
		msa->aseq[i][apos] = tolower((int) Alphabet[(int) dsq[i][rpos]]);
		apos++;
	      }
	    break;

	  case IR_st: 
	    cpos = emap->rpos[nd]-1;  /* -1 converts to "following this one" */
	    apos = irmap[cpos] - iruse[cpos];  /* writing backwards, 3'->5' */
	    rpos = tr[i]->emitr[tpos];
	    msa->aseq[i][apos] = tolower((int) Alphabet[(int) dsq[i][rpos]]);
	    iruse[cpos]++;
	    break;

	  case D_st:
	    if (cm->stid[v] == MATP_D || cm->stid[v] == MATL_D) 
	      {
		cpos = emap->lpos[nd];
		if (matuse[cpos]) msa->aseq[i][matmap[cpos]] = '-';
	      }
	    if (cm->stid[v] == MATP_D || cm->stid[v] == MATR_D) 
	      {
		cpos = emap->rpos[nd];
		if (matuse[cpos]) msa->aseq[i][matmap[cpos]] = '-';
	      }
	    break;

	  } /* end of the switch statement */
	  prvnd = nd;
	} /* end traversal over trace i. */

      /* Here is where we could put some insert-regularization code
       * a la HMMER: reach into each insert, find a random split point,
       * and shove part of it flush-right. But, for now, don't bother.
       */

    } /* end loop over all parsetrees */


  /* Gee, wasn't that easy?
   * Add the rest of the ("optional") information to the MSA.
   */
  con = CreateCMConsensus(cm, 3.0, 1.0);

  /* "author" info */
  msa->au   = malloc(sizeof(char) * (strlen(RELEASE)+10));
  sprintf(msa->au, "CMfinder %s", RELEASE);
  if (wgt != NULL) msa->flags |= MSA_SET_WGT;  
  for (i = 0; i < nseq; i++)
    {
      msa->sqname[i] = sre_strdup(sqinfo[i].name, -1);
      
      msa->sqlen[i]  = sqinfo[i].len;
      if (sqinfo[i].flags & SQINFO_ACC)
        MSASetSeqAccession(msa, i, sqinfo[i].acc);
      if (sqinfo[i].flags & SQINFO_DESC)
        MSASetSeqDescription(msa, i, sqinfo[i].desc);
      if (wgt == NULL) msa->wgt[i] = 1.0;
      else    {
	msa->wgt[i] = wgt[i];
      }
    }

  /* Construct the secondary structure consensus line, msa->ss_cons:
   *       IL, IR are annotated as .
   *       EL is annotated as ~
   *       and match columns use the structure code.
   * Also the primary sequence consensus/reference coordinate system line,
   * msa->rf.
   */
  msa->ss_cons = malloc(sizeof(char) * (alen+1));
  msa->rf = malloc(sizeof(char) * (alen+1));
  for (cpos = 0; cpos <= emap->clen; cpos++) 
    {
      if (matuse[cpos]) 
	{ /* CMConsensus is off-by-one right now, 0..clen-1 relative to cpos's 1..clen */

	  /* bug i1, xref STL7 p.12. Before annotating something as a base pair,
	   * make sure the paired column is also present.
	   */
	  if (con->ct[cpos-1] != -1 && matuse[con->ct[cpos-1]+1] == 0) {
	    msa->ss_cons[matmap[cpos]] = '.';
	    msa->rf[matmap[cpos]]      = con->cseq[cpos-1];
	  } else {
	    msa->ss_cons[matmap[cpos]] = con->cstr[cpos-1];	
	    msa->rf[matmap[cpos]]      = con->cseq[cpos-1];
	  }
	}
      if (maxil[cpos] > 0) 
	for (apos = ilmap[cpos]; apos < ilmap[cpos] + maxil[cpos]; apos++)
	  {
	    msa->ss_cons[apos] = '.';
	    msa->rf[apos] = '.';
	  }
      if (maxel[cpos] > 0)
	for (apos = elmap[cpos]; apos < elmap[cpos] + maxel[cpos]; apos++)
	  {
	    msa->ss_cons[apos] = '~';
	    msa->rf[apos] = '~';
	  }
      if (maxir[cpos] > 0)	/* remember to write backwards */
	for (apos = irmap[cpos]; apos > irmap[cpos] - maxir[cpos]; apos--)
	  {
	    msa->ss_cons[apos] = '.';
	    msa->rf[apos] = '.';
	  }
    }
  msa->ss_cons[alen] = '\0';
  msa->rf[alen] = '\0';

  /* Release the consensus, the emit map and all work arrays. */
  FreeCMConsensus(con);
  FreeEmitMap(emap);
  free(matuse);
  free(iluse);
  free(eluse);
  free(iruse);
  free(maxil);
  free(maxel);
  free(maxir);
  free(matmap);
  free(ilmap);
  free(elmap);
  free(irmap);
  return msa;
}
