/*********************************************************************

  blast2col.c

  usage: blast2col --expect=EXPECT --length=LENGTH --score=SCORE
               --similarity=SIMILARITY --ignore=IGNORE --top=TOP
               --mismatch=MISMATCH --alignlen=ALIGNLEN [FILE]

  This program takes input from stdin or FILE. The file should be in
  blast format.

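  The options produce only the matches with an expect value less than
  or equal to EXPECT, an alignment length of at least LENGTH, a score
  of at least SCORE, a fraction of identities of at least SIMILARITY,
  at most MISMATCH mismatches and an alignment length of at least
  ALIGNLEN times the query length. IGNORE is a list of residue pairs
  (query residue followed by subject residue); mismatches of these
  types are counted as identities. Only the first TOP matches are
  considered per query sequence.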

  Output is in the col format, through stdout.

  00301 Bjarne Knudsen (bk@daimi.au.dk)

  Genbank version



  Copyright (C) 2000 Bjarne Knudsen

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  02111-1307, USA.

*********************************************************************/

#include <stdio.h>
#include <stdlib.h>
#include <ctype.h>
#include <string.h>
#include "../clib/file.h"
#include "../clib/dna.h"

/* Print the command line synopsis to stderr (defined after main). */
void usage(void);

/* Remove a single trailing newline from LINE, if there is one. */
static void strip_newline(char *line)
{
  size_t n = strlen(line);

  if (n > 0 && line[n-1] == '\n')
    line[n-1] = '\0';
}

/* Read and discard COUNT lines from FP (stops silently at end of file). */
static void skip_lines(FILE *fp, int count)
{
  while (count-- > 0)
    free(GetLine(fp));
}

/* Convert an expect field to a number. An abbreviated value such as
   "e-100" (printed below as "1e-100") is read as 1e-100, not as 0. */
static double expect_to_double(const char *expect)
{
  char buf[24];

  if (expect[0] != 'e')
    return atof(expect);

  buf[0] = '1';
  strncpy(buf+1, expect, sizeof buf - 2);
  buf[sizeof buf - 1] = '\0';
  return atof(buf);
}

/*
  Convert blast output to col format.

  The options are parsed first, then the input (FILE or stdin) is read
  line by line:

    "Query= ..."   starts a new query; the next line holds its length
                   and the match counter is reset.
    ">..."         starts a new subject; name and comment are stored
                   and lines are skipped up to the "Length =" line.
    " Score =..."  starts a match; score, expect, identities and strand
                   are read, the filters are applied and the alignment
                   is written to stdout as a col entry.

  Any other line is discarded.

  Returns 0 on success and 1 on a usage error, a file error, malformed
  input or lack of memory. Messages go to stderr.

  NOTE(review): GetLine is assumed to return a malloc'ed line that
  still holds its newline, or NULL at end of file, and GetArgument is
  assumed to hand out options with one leading '-' removed - confirm
  against ../clib/file.h.
*/
int main(int argc, char **argv)
{
  FILE *fp;
  char junk[80];        /* used with scanf */
  int junki;            /* used with scanf */
  char *version;        /* for the blast version */
  char query[80] = "";            /* for the query sequence name */
  char subject[80] = "";          /* for the subject sequence name */
  char subject_comment[240] = ""; /* for the subject comments */
  int len;              /* characters consumed by sscanf (%n needs int) */
  size_t arg_len;
  int ok;
  int subject_len = 0;
  int align_len;
  char score[20], expect[20];
  int strand;           /* +1 is subject is plus strand, -1 otherwise */
  int ident;
  int numblock;
  int i, j;
  CmdArg *cmdarg;       /* Command line arguments */
  char *s;
  char query_seq[80], sbjct_seq[80];
  int query_len = 0;
  char *full_query, *full_sbjct;
  int query_pos = 0, sbjct_pos = 0;
  int number = 1;       /* running number of the match for this query */
  double max_expect, min_sim;
  int min_length;
  double min_score;
  int max_mismatch;
  int top;
  int pos;
  char ignore[80];      /* pairs: query residue, subject residue */
  int ignore_len;
  int ignore_ident;
  double min_alignlen;

  cmdarg = InitArgument(argc, argv);

  min_length = 0;
  min_score = 0.;
  max_expect = 10000.;
  top = 10000;
  ignore_len = 0;
  min_sim = 0;
  max_mismatch = 10000;
  min_alignlen = 0;

  /* An option is accepted only if its value parses and the whole
     argument is consumed. */
  while ((s = GetArgument(cmdarg)) != NULL) {
    arg_len = strlen(s);

    if (strncmp(s, "-expect=", 8) == 0)
      ok = sscanf(s, "-expect=%lf%n", &max_expect, &len) == 1 &&
        (size_t) len == arg_len;
    else if (strncmp(s, "-length=", 8) == 0)
      ok = sscanf(s, "-length=%d%n", &min_length, &len) == 1 &&
        (size_t) len == arg_len;
    else if (strncmp(s, "-similarity=", 12) == 0)
      ok = sscanf(s, "-similarity=%lf%n", &min_sim, &len) == 1 &&
        (size_t) len == arg_len;
    else if (strncmp(s, "-score=", 7) == 0)
      ok = sscanf(s, "-score=%lf%n", &min_score, &len) == 1 &&
        (size_t) len == arg_len;
    else if (strncmp(s, "-top=", 5) == 0)
      ok = sscanf(s, "-top=%d%n", &top, &len) == 1 &&
        (size_t) len == arg_len;
    else if (strncmp(s, "-mismatch=", 10) == 0)
      ok = sscanf(s, "-mismatch=%d%n", &max_mismatch, &len) == 1 &&
        (size_t) len == arg_len;
    else if (strncmp(s, "-alignlen=", 10) == 0)
      ok = sscanf(s, "-alignlen=%lf%n", &min_alignlen, &len) == 1 &&
        (size_t) len == arg_len;
    else if (strncmp(s, "-ignore=", 8) == 0) {
      /* The field width keeps the value inside ignore[]; a longer
         value is not fully consumed and is rejected. The value must
         consist of complete pairs. */
      ok = sscanf(s, "-ignore=%79s%n", ignore, &len) == 1 &&
        (size_t) len == arg_len &&
        (len-8) % 2 == 0;
      if (ok)
        ignore_len = len-8;
    }
    else
      ok = 0;

    if (!ok) {
      usage();
      return 1; }
  }

  if ((s = GetFilename(cmdarg)) == NULL)
    fp = stdin;
  else if (GetFilename(cmdarg) != NULL) {
    fprintf(stderr, "Usage: blast2col [FILE]\n");
    return 1; }
  else if ((fp = fopen(s, "r")) == NULL) {
    fprintf(stderr, "blast2col: Error in opening file '%s'\n", s);
    return 1; }

  printf("; Generated by blast2col\n");
  printf("; ========================================================================\n");

  /* The first line of the input is kept as the blast version. */
  if ((version = GetLine(fp)) == NULL) {
    fprintf(stderr, "blast2col: Error 1 in input\n");
    return 1; }

  while ((s = GetLine(fp)) != NULL) {
    if (strncmp(s, "Query= ", 7) == 0) {
      /* The name is everything after "Query=" and the following
         white space. */
      strip_newline(s);
      for (i = 6; isspace((unsigned char) s[i]); i++)
        ;
      strncpy(query, s+i, sizeof query - 1);
      query[sizeof query - 1] = '\0';
      free(s);

      s = GetLine(fp);
      if (s == NULL || sscanf(s, " (%d letters)", &query_len) != 1) {
        fprintf(stderr, "blast2col: Error 2 in input\n");
        return 1; }
      free(s);

      number = 1;
    }
    else if (s[0] == '>') {
      /* The name is the first token after '>', the comment is the
         rest of the line. */
      for (i = 0; s[i] != '\0' && !isspace((unsigned char) s[i]); i++)
        ;
      if (s[i] == '\0')
        subject_comment[0] = '\0';
      else {
        s[i] = '\0';
        strip_newline(s+i+1);
        strncpy(subject_comment, s+i+1, sizeof subject_comment - 1);
        subject_comment[sizeof subject_comment - 1] = '\0';
      }
      strncpy(subject, s+1, sizeof subject - 1);
      subject[sizeof subject - 1] = '\0';
      free(s);

      while ((s = GetLine(fp)) != NULL &&
             sscanf(s, " Length = %d", &subject_len) != 1)
        free(s);
      free(s);          /* the "Length =" line itself, or NULL */

      skip_lines(fp, 1);
    }
    else if (strncmp(s, " Score =", 8) == 0) {
      if (sscanf(s, " Score = %19s bits %79s Expect = %19s",
                 score, junk, expect) != 3) {
        fprintf(stderr, "blast2col: Error 3 in input\n");
        return 1; }
      free(s);

      /* A non-positive alignment length cannot be allocated or
         divided by, so it is rejected together with parse errors. */
      s = GetLine(fp);
      if (s == NULL ||
          sscanf(s, " Identities = %d/%d", &ident, &align_len) != 2 ||
          align_len <= 0) {
        fprintf(stderr, "blast2col: Error 4 in input\n");
        return 1; }
      free(s);

      s = GetLine(fp);
      if (s == NULL || sscanf(s, " Strand = Plus / %79s", junk) != 1) {
        fprintf(stderr, "blast2col: Error 5 in input\n");
        return 1; }
      free(s);

      if (StrCmp(junk, "Minus") == 0)
        strand = -1;
      else if (StrCmp(junk, "Plus") == 0)
        strand = 1;
      else {
        fprintf(stderr, "blast2col: Error 6 in input\n");
        return 1; }

      skip_lines(fp, 2);

      /* The alignment is read in blocks of 60 columns; this is the
         rounded up number of blocks (align_len is at least 1). */
      numblock = (align_len-1)/60 + 1;

      /* With an ignore list, similarity and mismatches can only be
         judged after the alignment has been read. */
      if (expect_to_double(expect) > max_expect ||
          atof(score) < min_score ||
          align_len < min_length ||
          number > top ||
          (query_len != 0 &&
           (double) align_len/query_len < min_alignlen) ||
          (ignore_len == 0 &&
           ((double) ident/align_len < min_sim ||
            align_len-ident > max_mismatch))) {
        skip_lines(fp, 5*numblock-1);
        number++;
        continue; }

      full_query = malloc((size_t) align_len);
      full_sbjct = malloc((size_t) align_len);
      if (full_query == NULL || full_sbjct == NULL) {
        fprintf(stderr, "blast2col: Out of memory\n");
        return 1; }

      pos = 0;
      for (i = 0; i < numblock; i++) {
        /* Only the start positions of the first block are kept. */
        s = GetLine(fp);
        if (s == NULL ||
            sscanf(s, "%79s %d %79s", junk,
                   i==0?&query_pos:&junki, query_seq) != 3) {
          fprintf(stderr, "blast2col: Error 7 in input\n");
          return 1; }
        free(s);

        skip_lines(fp, 1);        /* the match line */

        s = GetLine(fp);
        if (s == NULL ||
            sscanf(s, "%79s %d %79s", junk,
                   i==0?&sbjct_pos:&junki, sbjct_seq) != 3) {
          fprintf(stderr, "blast2col: Error 8 in input\n");
          return 1; }
        free(s);

        /* One line follows the last block, two follow the others. */
        skip_lines(fp, i != numblock-1 ? 2 : 1);

        for (j = 0; query_seq[j] != '\0' && sbjct_seq[j] != '\0';
             j++, pos++) {
          if (pos >= align_len) {
            fprintf(stderr, "blast2col: Error 9 in input\n");
            return 1; }
          full_query[pos] = (char) toupper((unsigned char) query_seq[j]);
          full_sbjct[pos] = (char) toupper((unsigned char) sbjct_seq[j]);
        }
      }

      /* The rows must add up to exactly the stated alignment length. */
      if (pos != align_len) {
        fprintf(stderr, "blast2col: Error 9 in input\n");
        return 1; }

      /* Count identities, including mismatches of an ignored type.
         On the minus strand the ignored pairs are complemented. */
      ignore_ident = 0;
      for (j = 0; j < align_len; j++) {
        if (full_query[j] == full_sbjct[j])
          ignore_ident++;
        else {
          for (i = 0; i < ignore_len; i += 2)
            if ((strand == 1 &&
                 full_query[j] == toupper((unsigned char) ignore[i]) &&
                 full_sbjct[j] == toupper((unsigned char) ignore[i+1])) ||
                (strand == -1 &&
                 full_query[j] ==
                 DNAcomplement(toupper((unsigned char) ignore[i])) &&
                 full_sbjct[j] ==
                 DNAcomplement(toupper((unsigned char) ignore[i+1])))) {
              ignore_ident++;
              break;
            }
        }
      }

      if ((double) ignore_ident/align_len >= min_sim &&
          align_len-ignore_ident <= max_mismatch) {
        printf("; TYPE              DNA_blast\n");
        printf("; COL 1             label\n");
        printf("; COL 2             query_residue\n");
        printf("; COL 3             match\n");
        printf("; COL 4             subject_residue\n");
        printf("; COL 5             query_seqpos\n");
        printf("; COL 6             subject_seqpos\n");
        printf("; ENTRY             %s_vs_%s\n", query, subject);
        printf("; BLAST_VERSION     %s", version);
        printf("; QUERY             %s\n", query);
        printf("; QUERY_LENGTH      %d\n", query_len);
        printf("; SUBJECT           %s\n", subject);
        printf("; SUBJECT_COMMENT   %s\n", subject_comment);
        printf("; SUBJECT_STRAND    %s\n", strand==1?"Plus":"Minus");
        printf("; SUBJECT_LENGTH    %d\n", subject_len);
        printf("; ALIGNMENT_LENGTH  %d\n", align_len);
        printf("; SCORE             %s\n", score);
        if (expect[0] == 'e')
          printf("; EXPECT            1%s\n", expect);
        else
          printf("; EXPECT            %s\n", expect);
        printf("; IDENTITIES        %d\n", ident);
        printf("; ----------\n");

        /* One row per alignment column; a gap gets '.' as position
           and does not advance the sequence position. */
        for (j = 0; j < align_len; j++) {
          printf("N %c  %c  %c", full_query[j],
                 full_query[j]==full_sbjct[j]?'-':'!', full_sbjct[j]);
          if (full_query[j] != '-') {
            printf("   %5d", query_pos);
            query_pos++; }
          else
            printf("       .");
          if (full_sbjct[j] != '-') {
            printf(" %5d\n", sbjct_pos);
            sbjct_pos += strand; }
          else
            printf("     .\n");
        }

        printf("; **********\n");
      }
      free(full_query);
      free(full_sbjct);
      number++;
    }
    else
      free(s);          /* a line of no interest */
  }

  free(version);

  if (fp != stdin && fclose(fp) != 0) {
    fprintf(stderr, "blast2col: Error in closing file\n");
    return 1; }

  return 0;
}

/* Write the command line synopsis of blast2col to stderr. */
void usage(void)
{
  static const char synopsis[] =
    "usage: blast2col --expect=MAX_EXPECT\n"
    "                 --length=MIN_LENGTH (alignment length)\n"
    "                 --similarity=MIN_SIMILARITY\n"
    "                 --score=MIN_SCORE\n"
    "                 --top=TOP\n"
    "                 --mismatch=MAX_MISMATCH\n"
    "                 --alignlen=MIN_ALIGNLEN (alignment length relative\n"
    "                         to query, e.g 0.5 for 50% of query hitting\n"
    "                 --ignore=IGNORE (which types of mismatches to ignore)\n"
    "                             [FILE]\n";

  /* The text is written as is, so the percent sign needs no escape. */
  fputs(synopsis, stderr);
}
