#!/usr/bin/gawk -f




#########################################################################
#  program getseqs / getseqs.awk
#
#  001019 Jan Gorodkin (gorodkin@bioinf.au.dk)
#
#  Copyright (C) 2000 Jan Gorodkin
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful, but
#  WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
#  General Public License for more details.
#
#  You should have received a copy of the GNU General Public License
#  along with this program; if not, write to the Free Software
#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
#  02111-1307, USA.
#
#########################################################################







# BEGIN: set defaults and parse the command line.
#
# Recognised options are removed from ARGV so that gawk only treats the
# remaining arguments as input files.  Variables set here and read later:
#   nseq     number of sequences written per blast input file
#   dformat  input format, "fasta" (default) or "col"
#   runname  name of the working directory
#   blast / align0 / qrna   external command lines ("" switches a tool off)
#   alength  value passed to blast2col as --length
#   crange   number of positions added on each side of a hit
#   discentry[]  entries (first word of each line of the -discgb file)
#                that are to be discarded from the blast result
#   exitnow  set to 1 when the END rule must not run the pipeline
BEGIN{

 # get time and date
 timedate=strftime("%m%d%H%M%S");

 # get proc id.  PROCINFO["pid"] is the supported source; the special
 # file /dev/pid only exists in old gawk releases and is kept as fallback.
 if("pid" in PROCINFO) pid=PROCINFO["pid"];
 else getline pid < "/dev/pid";

 # misc variables
 exitnow=0;
 seqcount=0;
 gbmaxchunk=20;
 scheme=1;

 # set default values
 nseq=25;
 gbseq=50;
 dformat="fasta";
 runname="TMP_getseq."pid"."timedate;
 alength=0;
 crange=100;

 blast="blastcl3 -p blastn -d nr";
 align0="align0";
 qrna="";

 # read options (ARGV[1] .. ARGV[ARGC-1])
 for(a=1;a<ARGC;a++)
 {
   opt=ARGV[a];

   # --- options that take a value in the following argument ---
   if(opt=="-nseq"||opt=="-runname"||opt=="-blast"||opt=="-align0"||opt=="-qrna"||opt=="-alength"||opt=="-discgb"||opt=="-crange")
   {
     if(a+1>=ARGC)
     { print "Option \""opt"\" requires an argument (exit)." > "/dev/stderr"; exitnow=1; exit; }

     optval=ARGV[a+1];
     delete ARGV[a]; delete ARGV[a+1];
     a++;   # the value has been consumed; do not inspect it as an option

     if(opt=="-nseq")
     {
       # keep only the digits of the value
       nseq=optval; gsub(/[^0-9]/,"",nseq);
       if(nseq!=optval)
       { print "Warning: only numbers in nseq is used. nseq="nseq"." > "/dev/stderr";}
       if(nseq=="")
       { print "nseq \""optval"\" contain no numbers (exit)." > "/dev/stderr"; exitnow=1; exit; }
       # nseq is used as a modulus in the END rule, so zero is not usable
       if(nseq+0<1)
       { print "nseq must be at least 1 (exit)." > "/dev/stderr"; exitnow=1; exit; }
     } # end nseq

     else if(opt=="-runname") { runname=optval; }

     else if(opt=="-blast")   { blast=optval; }

     else if(opt=="-align0")  { align0=optval; }

     else if(opt=="-qrna")    { qrna=optval; }

     else if(opt=="-alength") { alength=optval; }
     # end alength

     else if(opt=="-discgb")
     {
       # remember the first word of every line of the given file
       discgb="cat "optval;
       while ( ( discgb | getline tmpentry) > 0 )
       { split(tmpentry,tmpentry2," "); discentry[tmpentry2[1]]; }
       close(discgb);
     } # end discgb

     else if(opt=="-crange")  { crange=optval; }
     # end crange
   }

   # --- flags without a value ---
   else if(opt=="-colformat"||opt=="-col") { dformat="col"; delete ARGV[a];}
   # end dformat

   else if(opt=="-help" || opt=="-")
   {
     print "Usage:   getseqs [options] <file>";
     print "This program uses: Lynx Version 2.8.3dev.9. You must install this program";
     print "to get getseqs to work (is standard on must linux systems).";
     print "Options:";
     print "-nseq <number> Makes the blast search using nseq sequences at the time.";
     print "               Default is 25.";
     print "-colformat     Read sequences column (col) format instead of fasta format.";
     print "               Default is fasta format.";
     print "-col           The same as \"-colformat\".";
     print "-runname <string> The name of temporary data dir. By default it combines date, ";
     print "               time and process id, to create a unique identifier. If \"runname\"";
     print "               exists the extension of time will be added prior to making new";
     print "               \"runname\" dir.";
     print "-blast <'string'> The blast commandline execution.";
     print "               Default is ""'""blastcl3 -p blastn -d nr""'"".";
     print "               Data is piped to this command.";
     print "               Note that even the netblast execution, blastcl3, can be replaced";
     print "               with your local version of blast and even a complete path to that";
     print "               executable.This program must be installed locally in order to be";
     print "               used by getseqs.";
     print "-align0 <'string'> The align0 commandline executable. Default is ""'""align0""'"".";
     print "               To turn align0 usage off, use -align0 ""'""""'"".";
     print "               The command is executed on query and subject data files. This";
     print "               program must be installed locally in order to be used by getseqs.";
     print "-qrna <'string'> The qrna commandline execution. Default is -qrna ""'""""'"", that";
     print "               the program is not used by default. (se man page for details).";
     print "               To turn qrna usage off, use -qrna ""'""""'"". This program must be";
     print "               installed locally in order to be used by getseqs.";
     print "-alength <number> Filter the blast search by minimum allowable alignment length.";
     print "               Deafult is zero.";
     print "-discgb <file> File containing the list of (GenBank) entries to be discarded";
     print "               from the blast search.";
     print "-crange <number> The sequence context range to extend GenBank hit with. The";
     print "               extension is in both directions. Default is 100, but its";
     print "               recommend that size is of the size of the search sequence.";
     print "-help (or \"-\") Show this list.";
     exitnow=1; exit;
   } # end help

 }


} # BEGIN -- done ...


# FASTA input: runs for every input record while dformat is "fasta"
# (compared case-insensitively).
#   - a record whose first field starts with ">" opens a new sequence:
#     the whole record becomes its name and its body starts out empty;
#   - any other record appends a newline plus its first field to the body
#     of the sequence currently being read.
toupper(dformat)=="FASTA"{
 if($1 !~ /^>/)
 {
   seq[seqcount]=seq[seqcount] "\n" $1;
 }
 else
 {
   seqname[++seqcount]=$0;
   seq[seqcount]="";
 }
}


# COL input: runs for every input record while dformat is "col"
# (compared case-insensitively).  The second field selects what a record
# means:
#   TYPE   -> remember the (upper-cased) third field as the entry type
#   COL    -> third field is a column number; the fourth field says whether
#             it is the LABEL column or the NUCLEOTIDE column
#   ENTRY  -> for entries of type RNA, start a new sequence named ">"$3
# Afterwards, every record whose label column holds "N" (any case)
# contributes the content of its nucleotide column to the current sequence;
# a newline is inserted whenever the stored length is a multiple of 60.
toupper(dformat)=="COL"{
 colkey=toupper($2);

 if(colkey=="TYPE")
 {
   type=toupper($3);
 }
 else if(colkey=="COL")
 {
   colwhat=toupper($4);
   if(colwhat=="LABEL") label=$3;
   else if(colwhat=="NUCLEOTIDE") nucleotide=$3;
 }
 else if(colkey=="ENTRY"&&type=="RNA")
 {
   seqname[++seqcount]=">"$3;
 }

 separator=(length(seq[seqcount])%60==0) ? "\n" : "";

 if(toupper($label)=="N")
 {
   seq[seqcount]=seq[seqcount] separator $nucleotide;
 }
}


# Start processing the sequences
#
# END: runs the whole pipeline once all input has been read (skipped when
# the BEGIN rule set exitnow).  Stages, in order:
#   1. create the working directory <runname> and write the query
#      sequences as fasta files (sets of nseq sequences plus one file per
#      sequence);
#   2. pipe every sequence-set file into the blast command and collect the
#      output in <runname>/search.blast;
#   3. run blast2col on that file, drop hits whose subject is listed in
#      discentry[], and write the rest to <runname>/search.blast.col;
#   4. fetch the remaining subject entries with lynx, gbmaxchunk entries
#      per request, into <runname>/entries.gb;
#   5. read hit positions from extendlist and, for every fetched entry,
#      cut out the hit region and run align0 and/or qrna on it.
END{if(exitnow==0){

  # Without any sequence the blast stage would run "cat" with no file
  # argument, which reads standard input instead; stop here.
  if(seqcount<1)
  { print "No sequences read from input (exit)." > "/dev/stderr"; exit; }

  # make datafile
  # If <runname> already exists, move it out of the way first.  The command
  # is built in a variable so that the pipe into getline is unambiguous and
  # the pipe can be closed afterwards.
  lsrun="ls -1d "runname" 2>& 1";
  lsrun | getline tmpline;
  close(lsrun);
  if(tmpline==runname) system("mv "runname" "runname"."timedate);
  system("mkdir "runname);
  system("mkdir "runname"/fasta");
  dfcount=1;
  for(s=1;s<seqcount+1;s++)
  {
     datafile[dfcount] = runname"/fasta/seqset"dfcount".fasta";
     # per-sequence file; its name is the sequence name without the ">"
     singlefasta=runname"/fasta/seq."substr(seqname[s],2)".fasta";
     print seqname[s]""seq[s] >> datafile[dfcount]; close(datafile[dfcount]);
     # start a new set file after every nseq sequences (unless this was the last)
     if(s%nseq==0) { if(s<seqcount) dfcount++; }
     print seqname[s]""seq[s] >> singlefasta;  close(singlefasta);
  }



  # Do BLASTNET now
  blastfile=runname"/search.blast";
  netblast=blast;
  for(i=1;i<dfcount+1;i++)
  {
     runblast="cat "datafile[i]" | "netblast;
     blastout=0;
     while( (runblast | getline tmpline) > 0 )
     {
        # keep everything from the first line starting with "BLASTN" on
        if(substr(tmpline,1,6)=="BLASTN") blastout=1;
        if(blastout==1) print tmpline >> blastfile;
     }
     close(runblast);  # yeah yeah yeah it can be done in parallel...
     close(blastfile); # flush before the next run / before blast2col reads it
  }



  # Discard GenBank entries listed
  # do blast2col eetc...
  blast2col="blast2col  --length="alength" "blastfile;
  print "; Filtered blast search in column format"   > blastfile".col";
  print "; ========================================================================" >> blastfile".col";
  close(blastfile".col");

  k=1;
  wout=0;
  while( ( blast2col | getline tmpline) > 0 )
  {
     n=split(tmpline,tmps," ");
     # a TYPE line restarts the buffer of lines for the current entry
     if(toupper(tmps[2])=="TYPE"){ k=1; }
     if(toupper(tmps[2])=="SUBJECT")
     {
        # the last word is split on "|"; its second part is looked up in
        # discentry[] and, if kept, remembered for the GenBank download
        m=split(tmps[n],tmpr,"|");  # assume m<= 3..
        wout=1;
        if(tmpr[2] in discentry) { wout=0; }
        if(wout==1) gblist[tmpr[2]];
     }

     bline[k++]=tmpline;

     # a line of asterisks ends an entry: write the buffered lines if kept
     if(index(tmpline,"**********")>0&&wout==1)
     { for(l=1;l<k;l++) print bline[l] >> blastfile".col"; }
  }
  close(blast2col);
  close(blastfile".col");


  # get GenBank sequences now!
  # Join the entries into "+"-separated chunks of at most gbmaxchunk each.
  gbcount=0;
  tgbcount=gbmaxchunk-1;
  for(i in gblist)
  {
     if(tgbcount==gbmaxchunk-1) { tgbcount=0; gbcount++; gbchunk[gbcount]=i; }
     else { gbchunk[gbcount]=gbchunk[gbcount]"+"i; tgbcount++; }
  }

  gbfile=runname"/entries.gb";
  getgene1="lynx -dump 'http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=Search&db=Nucleotide&term=";
  getgene2="&doptcmdl=GenBank&dispmax="gbmaxchunk"'";
  for(i=1;i<gbcount+1;i++)
  {
     #print "EEE   "gbchunk[i];
     getgene=getgene1""gbchunk[i]""getgene2;
     gbout=0;
     while( (getgene | getline tmpline) > 0 )
     {
       # copy only the records, i.e. from a LOCUS line through the "//" line
       if(toupper(substr(tmpline,1,5))=="LOCUS"){gbout=1;}
       if(gbout==1) print tmpline >> gbfile;
       if(substr(tmpline,1,2)=="//"){gbout=0;}
     }
     close(getgene);
     close(gbfile);
  }



  # Extract regions of gb hits to align0 and qrna against indivual seq's.
  # Fields used from each extendlist line: 1 (stored as the query sequence
  # name), 5 (subject id), 6/7 (hit start/end) and 8 (strand).
  getextlist="extendlist -c0 "blastfile".col  ";
  while( (getextlist | getline tmpline) > 0 )
  {
     n=split(tmpline,tmps," ");
     # reduce the subject id: drop everything up to the first "|" and
     # everything from the first "." on (when a "|" follows it)
     gsub("^[^\\|]*\\|", "", tmps[5]);
     gsub("\\..*\\|[^\\|]*$", "", tmps[5]);
     gb=tmps[5];
     ks[gb]++;   # number of hits seen for this entry

     seq[ks[gb],gb]=tmps[1];
     qrna_start[ks[gb],gb]=tmps[6];
     qrna_end[ks[gb],gb]=tmps[7];
     strand[ks[gb],gb]=tmps[8];
     # align0 gets the hit extended by crange in both directions
     start[ks[gb],gb]=qrna_start[ks[gb],gb]-crange;
     end[ks[gb],gb]=qrna_end[ks[gb],gb]+crange;
  }
  close(getextlist);

  # go through the genbank file and do what needs to be done for each entry.
  # The file is read directly; if nothing was downloaded it does not exist
  # and the loop simply does not run.
  while( (getline tmpline < gbfile) > 0 )
  {
     n=split(tmpline,tmps," ");
     # LOCUS and ACCESSION both (re)start an entry; the id of the later
     # line is the one in effect when the entry ends
     if(toupper(substr(tmpline,1,9))=="ACCESSION"||toupper(substr(tmpline,1,5))=="LOCUS")
     { gb=tmps[2]; tmpseq=""; wout=0; }
     if(substr(tmpline,1,2)=="//")
     {
        # end of entry: process every hit recorded for it
        for(r=1;r<ks[gb]+1;r++)
        {
           # NOW EXECUTE PROGRAMS
           if(align0!="")
           {
              # make fasta file of gb entry suitable for align0 comparison
              fastaalign0=runname"/fasta/"gb"_align0."r"."seq[r,gb]".fasta";
              print ">"gb >> fastaalign0;
              outseq=grepgbseq(tmpseq,start[r,gb],end[r,gb],strand[r,gb]);
              tlen=length(outseq);
              for(i=1;i<tlen+1;i+=60) print substr(outseq,i,60) >> fastaalign0;
              close(fastaalign0);

              #getalign0=(align0" "runname"/fasta/seq."seq[r,gb]".fasta  "fastaalign0)" 2>& 1";
              getalign0=align0" "runname"/fasta/seq."seq[r,gb]".fasta  "fastaalign0;
              while( (getalign0 | getline newtmpline) > 0 )
              { print newtmpline >> runname"/align0.out"; }
              close(getalign0);
              close(runname"/align0.out");
           }

           if(qrna!="")
           {
             # make fasta file of gb entry suitable for qrna comparison
             # (the hit region itself, without the crange extension)
             fastaqrna=runname"/fasta/"gb"_qrna."r"."seq[r,gb]".fasta";
             print ">"gb >> fastaqrna;
             outseq=grepgbseq(tmpseq,qrna_start[r,gb],qrna_end[r,gb],strand[r,gb]);
             tlen=length(outseq);
             for(i=1;i<tlen+1;i+=60) print substr(outseq,i,60) >> fastaqrna;
             close(fastaqrna);

             getqrna = qrna" "runname"/fasta/seq."seq[r,gb]".fasta  "fastaqrna;
             while( (getqrna | getline newtmpline) > 0 )
             { print newtmpline >> runname"/qrna.out"; }
             close(getqrna);
             close(runname"/qrna.out");
           }
        }
        wout=0;
     }
     # between ORIGIN and "//": drop the first 9 characters of the line and
     # all blanks, and append the rest to the entry's sequence
     if(wout==1) { tseq=substr(tmpline,10); gsub(" ","",tseq); tmpseq=tmpseq""tseq; }
     if(toupper(substr(tmpline,1,6))=="ORIGIN") { wout=1; }
  }
  close(gbfile);





}}




# grepgbseq -- cut a region out of a sequence, optionally reverse-complemented.
#
#   gbtmpseq      the full sequence
#   gbstartpos    first position of the region (1-based, as for substr)
#   gbendpos      last position of the region (inclusive)
#   gbstrandtype  "minus" (any case) returns the reverse complement;
#                 any other value returns the region as it is
#
# Every character other than A, C, G, U, T (either case) is replaced by "X".
# On the minus strand the result is upper case: A->T, C->G, G->C, T/U->A.
#
# The names after the extra spaces in the parameter list are local work
# variables (awk's convention for locals); callers pass four arguments.
function grepgbseq(gbtmpseq,gbstartpos,gbendpos,gbstrandtype,    region,regionlen,revcomp,p) {

  region=substr(gbtmpseq,gbstartpos,gbendpos-gbstartpos+1);
  gsub(/[^ACGUTacgut]/,"X",region);

  if(toupper(gbstrandtype)!="MINUS") return region;

  # Complement in two passes through digit placeholders, so that a base
  # that has already been replaced is not replaced a second time.
  gsub(/[Aa]/,"1",region); gsub(/[Cc]/,"2",region); gsub(/[Gg]/,"3",region); gsub(/[TtUu]/,"4",region);
  gsub("1","T",region);    gsub("2","G",region);    gsub("3","C",region);    gsub("4","A",region);

  # Reverse by prepending one character at a time.
  regionlen=length(region);
  revcomp="";
  for(p=1;p<=regionlen;p++) revcomp=substr(region,p,1) revcomp;

  return revcomp;
}













############ end of file #############



