package EST2ncRNA::Annotation;

use File::Copy qw(cp);

=head1 NAME

C<EST2ncRNA::Annotation> - Several annotation steps of non-coding RNA candidates

=head1 SYNOPSIS

    use EST2ncRNA::Annotation;

    $server = eval { new EST2ncRNA::ServerInterface($SERVER, $WORKDIR, $QUEUE, $NODES, $WALLT); } or die ($@);
    $server->prepare_dir_tree;
    $annotation = eval { new EST2ncRNA::Annotation($WORKDIR, $server); } or die ($@);

    $annotation->tRNAscan_SE($fastafile, $resultfile, $secstrfile, $statisticfile);
    $annotation->rnamicrohome($RNAmicro_execution_folder);
    $annotation->RNAmicro($aln_folder, \@aln_files, $output_file_plus_strand, $output_file_minus_strand);
    $annotation->snoreporthome($snoRNA_execution_folder);
    $annotation->snoReport($aln_folder, \@aln_files, $output_file_plus_strand, $output_file_minus_strand);

=cut

use strict;
use warnings;

require Exporter;

our @ISA = qw(Exporter);  # inherits from Exporter

# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.

our @EXPORT_OK = qw(
		   );

our @EXPORT = qw();

our $VERSION = '0.01';


=head1 METHODS

=head2 B<new>

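Creates a new EST2ncRNA::Annotation class instance and creates its working subdirectory C<est2ncrna.annotation> if it does not exist yet
    Input  - (1) working directory in which the annotation subdirectory is created
             (2) C<server instance>
    Output - EST2ncRNA::Annotation class instance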

=cut

sub new {
    # Constructor: takes the class name, the working directory and the
    # server-interface object; returns the blessed instance.
    my ($class, $workdir, $server) = @_;

    # The tool locations start out unset; they are filled in later through
    # the accessor methods of this class.
    my %attributes = (
        _subdir            => $workdir."/est2ncrna.annotation",
        _server            => $server,
        _rnamicrohome      => undef,
        _rnamicromodeldir  => undef,
        _snoreporthome     => undef,
        _snoreportmodeldir => undef,
    );
    my $self = bless \%attributes, $class;

    # make sure the annotation subdir is there
    if( !-d $self->{_subdir} ) {
        mkdir $self->{_subdir};
    }

    return $self;
}


=head2 B<subdir>

Accessor method for Annotation subdir

=cut

sub subdir {
    # read-only accessor: path of the annotation subdirectory
    my $self = shift;
    return $self->{_subdir};
}


=head2 B<server>

Accessor method for Annotation server-interface

=cut

sub server {
    # read-only accessor: the server-interface object handed to the constructor
    my $self = shift;
    return $self->{_server};
}


=head2 B<rnamicrohome>

Accessor method for Annotation rnamicrohome (executable directory of RNAmicro)

=cut

sub rnamicrohome {
    # combined getter/setter: a defined argument replaces the stored value,
    # the (possibly updated) value is returned in any case
    my $self = shift;
    my $new_home = shift;
    if( defined $new_home ) {
        $self->{_rnamicrohome} = $new_home;
    }
    return $self->{_rnamicrohome};
}


=head2 B<rnamicromodeldir>

Accessor method for Annotation rnamicromodeldir (model directory of RNAmicro)

=cut

sub rnamicromodeldir {
    # combined getter/setter: a defined argument replaces the stored value,
    # the (possibly updated) value is returned in any case
    my $self = shift;
    my $new_dir = shift;
    if( defined $new_dir ) {
        $self->{_rnamicromodeldir} = $new_dir;
    }
    return $self->{_rnamicromodeldir};
}

=head2 B<snoreporthome>

Accessor method for Annotation snoreporthome (executable directory of snoReport)

=cut

sub snoreporthome {
    # combined getter/setter: a defined argument replaces the stored value,
    # the (possibly updated) value is returned in any case
    my $self = shift;
    my $new_home = shift;
    if( defined $new_home ) {
        $self->{_snoreporthome} = $new_home;
    }
    return $self->{_snoreporthome};
}


=head2 B<snoreportmodeldir>

Accessor method for Annotation snoreportmodeldir (model directory of snoReport)

=cut

sub snoreportmodeldir {
    # combined getter/setter: a defined argument replaces the stored value,
    # the (possibly updated) value is returned in any case
    my $self = shift;
    my $new_dir = shift;
    if( defined $new_dir ) {
        $self->{_snoreportmodeldir} = $new_dir;
    }
    return $self->{_snoreportmodeldir};
}


=head2 B<tRNAscan_SE>

Scan a sequence file for tRNAs using tRNAscan, EufindtRNA & tRNA covariance models
    Input  - (1) the query sequences as gzipped fasta file
             (2) file saving final results (optional)
             (3) file saving tRNA secondary structures (optional)
             (4) file saving the statistics summary of the run (optional)
    Output - a reference to a hash including as key the sequence ID and as value the annotation text
    B<REQUIREMENT:> the tool C<tRNAscan-SE> in local PATH

    Lowe, T.M. & Eddy, S.R. (1997) "tRNAscan-SE: A program for
    improved detection of transfer RNA genes in genomic sequence"
    Nucl. Acids Res. 25: 955-964.

=cut

sub tRNAscan_SE {
    # Unpacks the gzipped fasta file, runs tRNAscan-SE on it and returns a
    # reference to a hash (sequence id => "tRNA <column 5 of the result line>").
    # The three output files default to tRNA.result, tRNA.struct and
    # tRNA.stat inside the annotation subdir.
    my ($self, $fastafile, $resultfile, $secstrfile, $statisticfile) = @_;
    my %anno;

    my $SUBDIR = $self->subdir;
    $resultfile    = "$SUBDIR/tRNA.result" unless defined $resultfile;
    $secstrfile    = "$SUBDIR/tRNA.struct" unless defined $secstrfile;
    $statisticfile = "$SUBDIR/tRNA.stat"   unless defined $statisticfile;

    # remove the output files of a previous run
    unlink $resultfile, $secstrfile, $statisticfile;

    # NOTE(review): the file names are interpolated into a shell command
    # line, so they must not contain shell metacharacters.
    `gunzip -f $fastafile`;
    # strip only the trailing .gz suffix, not the first ".gz" anywhere in the path
    $fastafile =~ s/\.gz\z//;
    `tRNAscan-SE -o $resultfile -f $secstrfile -m $statisticfile $fastafile 2> /dev/null`;

    # No result file is treated as "nothing annotated" (this keeps the
    # previous best-effort behaviour), but it is reported explicitly.
    if( !-e $resultfile ) {
        warn("tRNAscan-SE did not write the result file '$resultfile'\n");
        return \%anno;
    }

    # fill hash with key=id and value=annotation extracted from $resultfile;
    # only lines starting with a digit are taken as result lines
    open my $in, '<', $resultfile or die("Can not open the file '$resultfile': $!\n");
    while( my $row = <$in> ) {
        next unless $row =~ /^\d+/;
        my @field = split " ", $row;
        $anno{$field[0]} = "tRNA ".$field[4];
    }
    close $in;

    return \%anno;
}


=head2 B<db_hits>

Scan blast results prepared with C<kvlblast2table> for annotation and look up extra information in the database file if one is specified. For each sequence it takes the blast hit with the lowest e-value and an identity of at least 70 nts
    Input  - (1) blast result file in table format (full path)
             (2) database file (optional)
    Output - a reference to a hash including as key the sequence ID and as value the annotation text

=cut

sub db_hits {
    # Reads the blast table and returns a reference to a hash
    # (sequence id => annotation). The annotation is the subject id of the
    # first listed hit whose column 8 is at least 70; if a gzipped database
    # file is given, it is replaced by the full fasta header line of that
    # subject. Dies if an input file cannot be opened.
    my ($self, $blasttable, $db) = @_;
    my (%est_ids_of, %anno);

    open my $table, '<', $blasttable or die("Can not open the file '$blasttable': $!\n");
    scalar <$table>;    # the first line is skipped
    while( my $row = <$table> ) {
        my @field = split " ", $row;
        next unless @field > 7;                 # blank or truncated line
        next if defined $anno{$field[0]};       # only the first accepted hit counts
        next unless $field[7] >= 70;

        $anno{$field[0]} = $field[1];
        # several sequences may share the same best hit - remember all of them
        push @{$est_ids_of{$field[1]}}, $field[0];
    }
    close $table;

    if( defined $db ) {
        -r $db or die("Can not open the file '$db': $!\n");
        # list form of the pipe open: no shell is involved, so special
        # characters in the file name are harmless
        open my $dbfh, '-|', 'zcat', $db or die("Can not run zcat on '$db': $!\n");
        while( my $header = <$dbfh> ) {
            next unless $header =~ /^>/;
            chomp $header;
            $header =~ s/>//;
            my ($db_id) = split " ", $header;
            next unless defined $db_id && defined $est_ids_of{$db_id};
            $anno{$_} = $header foreach @{$est_ids_of{$db_id}};
        }
        close $dbfh or warn("zcat reported a problem while reading '$db'\n");
    }

    return \%anno;
}


=head2 B<RNAmicro>

Runs RNAmicro to find microRNAs.
    Input  - (1) the folder name where the aln-files lie
             (2) reference to an array including all input alignment filenames
             (3) optional the filename of the output for '+' strand prediction
	     (4) optional (if (3)) the filename of the output for '-' strand prediction
    Output - a reference to a hash including as key the sequence ID and as value the annotation text (start_index end_index strand pvalue) of all microRNA structures located in the EST separated by C<:>
             
    B<REQUIREMENT:> the tool C<RNAmicro> has to be installed on the server and its path has to be published in the C<rnamicrohome> attribute of Annotation

=cut

sub RNAmicro {
    # Writes a tcsh script that runs RNAmicro (window sizes 70, 100 and 130,
    # once per strand) on every *.aln file of @$alns_ref, submits it on the
    # server, waits for it, copies both result files back and returns a
    # reference to a hash: sequence id => "start end strand pvalue" entries
    # joined by ":".
    my ($self, $aln_d, $alns_ref, $plusfile, $minusfile) = @_;
    my %microrna;

    my $SUBDIR = $self->subdir;
    my $HOME = $self->server->home;
    my $SERVER = $self->server->name;
    my $RNAMICROHOME = $self->rnamicrohome;
    my $RNAMICROMODELDIR = $self->rnamicromodeldir;
    my $PBSUBMIT = $self->server->pbsubmit;
    my $QUEUE = $self->server->queue;
    my $WALLT = $self->server->wallt;
    my $OUTPUTPLUS = ( defined $plusfile ) ? $plusfile : "rnamicro.cand.plus.out";
    my $OUTPUTMINUS = ( defined $minusfile ) ? $minusfile : "rnamicro.cand.minus.out";

    # the commands append (>>) to the result files, so the default files
    # are removed first; caller-supplied files are left untouched
    $self->server->sshcall("rm -f $HOME/$SUBDIR/$OUTPUTPLUS") unless defined $plusfile;
    $self->server->sshcall("rm -f $HOME/$SUBDIR/$OUTPUTMINUS") unless defined $minusfile;

    # run RNAmicro 3 times for all aln-files in $aln_d
    my $script = "$SUBDIR/pbs_rnamicro.sh";
    open my $out, '>', $script or die("Can not open the file '$script': $!\n");
    print $out "#!/bin/tcsh\n set path = ( $PBSUBMIT \$path )\n setenv MODELDIR $RNAMICROMODELDIR\n cd $RNAMICROHOME\n";

    foreach my $w_size ( 70, 100, 130 ) {
        foreach my $aln ( @$alns_ref ) {
            next unless $aln =~ /\.aln/;
            my $aln_f = $aln_d."/".$aln;

            print $out "$RNAMICROHOME/RNAmicro -w $w_size -i $aln_f -s >> $HOME/$SUBDIR/$OUTPUTPLUS\n";
            # -r is added for the '-' strand run
            print $out "$RNAMICROHOME/RNAmicro -r -w $w_size -i $aln_f -s >> $HOME/$SUBDIR/$OUTPUTMINUS\n";
        }
    }

    # buffered write errors only show up at close
    close $out or die("Can not write the file '$script': $!\n");

    # run RNAmicro on server
    $self->server->scpcall("$SUBDIR/pbs_rnamicro.sh","$SERVER:$HOME/$SUBDIR");
    $self->server->sshcall("cd $HOME/$SUBDIR; chmod u+x pbs_rnamicro.sh");
    `ssh $SERVER '$PBSUBMIT -o "-N rnamicro -l walltime=$WALLT" $QUEUE -D -Q -B $HOME -c "$HOME/$SUBDIR/pbs_rnamicro.sh"'`;

    # wait until all RNAmicro processes are finished
    $self->server->wait("rnamicro");

    # copy output from server to home
    $self->server->scpcall("$SERVER:$HOME/$SUBDIR/$OUTPUTPLUS", "$SUBDIR");
    $self->server->scpcall("$SERVER:$HOME/$SUBDIR/$OUTPUTMINUS", "$SUBDIR");

    # analyze the RNAmicro output of both strands ('+' first) and build the
    # annotation text
    foreach my $run ( [$OUTPUTPLUS, "+"], [$OUTPUTMINUS, "-"] ) {
        my $mirna_refhash = $self->analyze_rnamicro_output(@$run);
        foreach my $est ( keys %$mirna_refhash ) {
            foreach my $hit ( @{$$mirna_refhash{$est}} ) {
                my $text = "$$hit[0] $$hit[1] $$hit[2] $$hit[3]";
                $microrna{$est} = (defined $microrna{$est}) ? $microrna{$est}.":".$text : $text;
            }
        }
    }

    return \%microrna;
}


sub analyze_rnamicro_output {
    # Parses an RNAmicro output file lying in the annotation subdir.
    #   Input  - (1) file name relative to the annotation subdir
    #            (2) strand label stored with every hit
    #   Output - reference to a hash: key = sequence id (basename of the
    #            alignment file in column 19 up to its first dot), value =
    #            reference to an array of hits [start, end, strand, pvalue];
    #            overlapping windows are merged into one hit
    # Only lines with column 1 == 1 and column 2 >= 0.5 are used.
    my ($self, $output, $strand) = @_;
    my %mirna;

    my $SUBDIR = $self->subdir;
    my $path = "$SUBDIR/$output";

    open my $in, '<', $path or die("Can not open the file '$path': $!\n");
    while( my $row = <$in> ) {
        my @line = split " ", $row;
        next unless @line > 18;                         # incomplete line
        next unless $line[0]==1 && $line[1]>=0.5;

        # parse f:* t:* and change start index from 0-based to 1-based
        my $wstart = (split ":", $line[15])[1]+1;
        my $wend = (split ":", $line[16])[1];
        my $pvalue = $line[1];
        my $key = ( split '\.' , ( reverse( split "\/", $line[18] ) )[0] )[0];

        my $new = 1;
        foreach my $hit ( @{ $mirna{$key} || [] } ) {
            # new hit start lies inside an already registered hit
            if( $wstart>$$hit[0] && $wstart<=$$hit[1] ) {
                $$hit[1] = $wend if $$hit[1]<$wend;
            }
            # new hit end lies inside an already registered hit: extend the
            # registered hit to the left (the old test was inverted and
            # could never change the start)
            elsif( $wend>=$$hit[0] && $wend<$$hit[1] ) {
                $$hit[0] = $wstart if $wstart<$$hit[0];
            }
            # new hit start and end lie around already registered hit
            elsif( $wstart<=$$hit[0] && $wend>=$$hit[1] ) {
                $$hit[0] = $wstart;
                $$hit[1] = $wend;
            }
            else {
                next;
            }
            # a merged hit keeps the highest pvalue
            $$hit[3] = $pvalue if $$hit[3]<$pvalue;
            $new = 0;
        }

        # first hit of this sequence or a hit on another position than the
        # already registered ones (microRNA cluster)
        push( @{$mirna{$key}}, [$wstart, $wend, $strand, $pvalue] ) if $new;
    }
    close $in;

    return \%mirna;
}


=head2 B<snoReport>

Runs snoReport to find snoRNAs with p-value>0.5
    Input  - (1) the folder name where the aln-files lie
             (2) reference to an array including all input alignment filenames
             (3) optional the filename of the output for '+' strand prediction
             (4) optional (if (3)) the filename of the output for '-' strand prediction
 
    Output -  a reference to a hash holding all predicted snoRNAs,
	      it includes as key the sequence ID and as value an anonymous array with pvalue, EST-strand and snoRNA class (CD or HACA)
    B<REQUIREMENT:> the tool C<snoReport> has to be installed on the server and its path has to be published in the C<snoreporthome> attribute of Annotation

=cut

sub snoReport {
    # Writes a tcsh script that runs snoReport on every *.aln file of
    # @$alns_ref, submits it on the server, waits for it, copies the result
    # file back and returns a reference to a hash:
    # sequence id => [pvalue, strand, class], taken from the result lines
    # starting with CD or HACA whose column 2 equals 1.
    # Only the '+' strand output is produced; $minusfile is accepted for
    # interface compatibility but not used.
    my ($self, $aln_d, $alns_ref, $plusfile, $minusfile) = @_;
    my %snorna;

    my $SUBDIR = $self->subdir;
    my $SERVER = $self->server->name;
    my $HOME = $self->server->home;
    my $PBSUBMIT = $self->server->pbsubmit;
    my $QUEUE = $self->server->queue;
    my $WALLT = $self->server->wallt;
    my $SNOREPORTHOME = $self->snoreporthome;
    my $SNOREPORTMODELDIR = $self->snoreportmodeldir;

    my $OUTPUTPLUS = ( defined $plusfile ) ? $plusfile : "snoreport.cand.plus.out";
    # the commands append (>>), so the default result file is removed first
    $self->server->sshcall("rm -f $HOME/$SUBDIR/$OUTPUTPLUS") unless defined $plusfile;

    # run snoReport for all aln-files in $aln_d
    my $script = "$SUBDIR/pbs_snoreport.sh";
    open my $out, '>', $script or die("Can not open the file '$script': $!\n");
    print $out "#!/bin/tcsh\n set path = ( $PBSUBMIT \$path )\n cd $SNOREPORTHOME\n";

    foreach my $aln ( @$alns_ref ) {
        next unless $aln =~ /\.aln/;
        my $aln_f = $aln_d."/".$aln;
        print $out "$SNOREPORTHOME/snoReport -i $aln_f -m $SNOREPORTMODELDIR -a >> $HOME/$SUBDIR/$OUTPUTPLUS\n";
    }

    # buffered write errors only show up at close
    close $out or die("Can not write the file '$script': $!\n");

    # run snoReport on server
    $self->server->scpcall("$SUBDIR/pbs_snoreport.sh","$SERVER:$HOME/$SUBDIR");
    $self->server->sshcall("cd $HOME/$SUBDIR; chmod u+x pbs_snoreport.sh");
    `ssh $SERVER '$PBSUBMIT -o "-N snoreport -l walltime=$WALLT" $QUEUE -D -Q -B $HOME -c "$HOME/$SUBDIR/pbs_snoreport.sh"'`;

    # wait until all snoReport processes are finished
    $self->server->wait("snoreport");

    # copy output from server to home
    $self->server->scpcall("$SERVER:$HOME/$SUBDIR/$OUTPUTPLUS", "$SUBDIR");

    # read the output and save all ESTs predicted as snoRNA in a hash
    # (key=estid, value=array including pvalue, strand and class)
    my $file = "$SUBDIR/$OUTPUTPLUS";
    open my $in, '<', $file or die("Can not open the file '$file': $!\n");
    while( my $row = <$in> ) {
        next unless( $row =~ /^CD/ || $row =~ /^HACA/ );
        my @line = split " ", $row;
        next unless @line > 21;             # incomplete line
        next unless $line[1] == 1;

        my $estid = ( reverse( split '\/', $line[20] ) )[0];
        $estid = ( split '\.', $estid )[0];

        # if EST is already predicted as snoRNA on the same strand through
        # another alignment then keep the higher pvalue
        my $known = $snorna{$estid};
        next if defined $known && $$known[1] eq $line[21] && $line[2]<=$$known[0];

        $snorna{$estid} = [$line[2], $line[21], $line[0]];
    }
    close $in;

    return \%snorna;
}


sub snoReport2 {
    # Alternative snoReport run with window sizes 70, 100 and 130 over all
    # *.aln files found in $aln_d.
    #   Input  - the folder where the aln-files lie
    #   Output - reference to a hash: key = sequence id, value = text
    #            "start end pvalue" of every independent snoRNA location,
    #            joined by ":"
    # NOTE(review): relies on the accessors 'snoreport' and 'sno', which are
    # not defined in the visible part of this module - confirm where they
    # are provided before using this method.
    my ($self, $aln_d) = @_;
    my (%snorna, %merged, %anno);

    my $SUBDIR = $self->subdir;
    my $HOME = $self->server->home;
    my $SNOWREPORT = $self->snoreport;
    my $SNOW = $self->sno;
    my $OUTPUT = "snoreport.cand.out";
    my $result = "$HOME/$SUBDIR/$OUTPUT";
    my $script = "$SUBDIR/pbs_snoreport.sh";

    # the commands append (>>) to the result file, so start from scratch
    unlink $result;

    # the directory listing is the same for every window size
    opendir(my $dh, $aln_d) or die $!;
    my @alns = grep { /\.aln/ } readdir($dh);
    closedir($dh);

    # accept the folder name with or without a trailing slash
    my $prefix = ( $aln_d eq "" || $aln_d =~ m{/\z} ) ? $aln_d : "$aln_d/";

    # run snoReport 3 times for all aln-files in $aln_d
    foreach my $w_size ( 70, 100, 130 ) {
        open my $out, '>', $script or die("Can not open the file '$script': $!\n");
        print $out "#!/bin/bash\n export SNOW=$SNOW\n export PATH=$SNOWREPORT:\$PATH\n";

        foreach my $aln ( @alns ) {
            my $aln_f = $prefix.$aln;
            next unless -f $aln_f;
            print $out "snoReport -d -i -w $w_size $aln_f >> $HOME/$SUBDIR/$OUTPUT\n";
        }

        close $out or die("Can not write the file '$script': $!\n");
        $self->server->sshcall("chmod u+x $SUBDIR/pbs_snoreport.sh; $SUBDIR/pbs_snoreport.sh");
    }

    # read the output and save for each EST-key each independent snoRNA
    # location (column 1 == 1 and column 2 >= 0.5)
    # %snorna: key=EST-id, value=pointer to an array of [start, end, pvalue]
    open my $in, '<', $result or die("Can not open the file '$result': $!\n");
    while( my $row = <$in> ) {
        my @line = split " ", $row;
        next unless @line > 14;                     # incomplete line
        next unless $line[0]==1 && $line[1]>=0.5;

        ( my $from = $line[12] ) =~ s/from\://;
        ( my $to = $line[13] ) =~ s/to\://;
        my $pvalue = $line[1];
        my $key = ( split '\.' , ( reverse( split "\/", $line[14] ) )[0] )[0];

        my $changed = 0;
        foreach my $hit ( @{ $snorna{$key} || [] } ) {
            # new hit start lies inside an already registered hit
            if( $from>$$hit[0] && $from<=$$hit[1] ) {
                $$hit[1] = $to if $$hit[1]<$to;
            }
            # new hit end lies inside an already registered hit: extend the
            # registered hit to the left (the old test was inverted and
            # could never change the start)
            elsif( $to>=$$hit[0] && $to<$$hit[1] ) {
                $$hit[0] = $from if $from<$$hit[0];
            }
            # new hit start and end lie around already registered hit
            elsif( $from<=$$hit[0] && $to>=$$hit[1] ) {
                $$hit[0] = $from;
                $$hit[1] = $to;
            }
            else {
                next;
            }
            $$hit[2] = $pvalue if $$hit[2]<$pvalue;
            $changed = 1;
            last;       # only the first overlapping hit is merged
        }

        # first hit of this EST or a hit on another EST position
        push( @{$snorna{$key}}, [$from, $to, $pvalue] ) unless $changed;
    }
    close $in;

    # second output extracting step
    # because in the first step the snoRNA locations are expanded it is
    # possible that locations still overlap
    foreach my $est ( keys %snorna ) {
        foreach my $hit ( @{$snorna{$est}} ) {
            my $changed = 0;
            foreach my $kept ( @{ $merged{$est} || [] } ) {
                if( $$hit[0]<=$$kept[1] && $$hit[1]>$$kept[1] ) {
                    $$kept[1] = $$hit[1];
                    $$kept[2] = $$hit[2] if $$kept[2]<$$hit[2];
                    $changed = 1;
                    last;
                }
            }
            push( @{$merged{$est}}, [ $$hit[0], $$hit[1], $$hit[2] ] ) unless $changed;
        }
    }

    # Build the annotation text in its own hash. Writing it back into
    # %snorna prefixed every text with the stringified array reference
    # ("ARRAY(0x...)") that was still stored there.
    foreach my $est ( keys %merged ) {
        $anno{$est} = join( ":", map { "$$_[0] $$_[1] $$_[2]" } @{$merged{$est}} );
    }

    return \%anno;
}


=head2 B<prepare_aln_files>

=cut

sub prepare_aln_files {
    # Packs $alndir (relative to $OLDALNSUBDIR) into clustalw.tar.gz, copies
    # the archive into the annotation subdir on the server, unpacks it there
    # and blocks until no tar process of the user is listed any more.
    my ($self, $OLDALNSUBDIR, $alndir) = @_;

    my $server = $self->server;
    my $SERVER = $server->name;
    my $SERVER_HOME = $server->home;
    my $SUBDIR = $self->subdir;
    my $USERID = $server->userid;

    # build the archive locally and unpack it on the running machine
    system(qq{cd $OLDALNSUBDIR; tar -vzcf clustalw.tar.gz $alndir > /dev/null});
    $server->scpcall("$OLDALNSUBDIR/clustalw.tar.gz", "$SERVER:$SERVER_HOME/$SUBDIR");
    $server->sshcall("cd $SERVER_HOME/$SUBDIR; tar xzvf clustalw.tar.gz > /dev/null");

    # poll once per minute until the tar process of the user has finished
    my @tar_processes;
    do {
        @tar_processes = `ssh $SERVER "ps u -C tar | grep $USERID"`;
        sleep 60 if @tar_processes;
    } while( @tar_processes );
}


=head2 B<prepare_loci_aln_files>

=cut

sub prepare_loci_aln_files {
	# Placeholder without implementation: the variables below are only
	# declared, nothing is assigned to them and nothing else is done.
	# NOTE(review): presumably meant to become "my (...) = @_;" once the
	# method is implemented - confirm the intended arguments.
	my ($self, $est_loci, $rnazwindowfile, $maffile);

}


=head2 B<cand_homologous_seq>

=cut

sub cand_homologous_seq {
    # Collects for every candidate listed in $queryFile the aligned
    # subsequence of organism $org from the RNAz output.
    #   Input  - (1) organism name as used in the second "|"-separated part
    #                of the sequence names in the RNAz output
    #            (2) result file; the strand file and the RNAz output are
    #                derived from its name by replacing "results.dat" with
    #                "strand.txt" and "out"
    #   Output - reference to a hash: key = "id:start:<column 4>", value =
    #            "<target name>:start:end:strand", or 0 if no sequence of
    #            $org was found
    # Dies if one of the three files cannot be opened.
    my ($self, $org, $queryFile) = @_;
    my (@line, %homotmp, %homo, $key, @target, $hstart, $hend, $hstrand, %eststrand, $strandswitch);

    # candidates: only lines with exactly 8 columns are used
    open my $results, '<', $queryFile or die("Can not open the file '$queryFile': $!\n");
    while( my $row = <$results> ) {
        @line = split " ", $row;
        next unless @line==8;
        my $locus = $line[1].":".($line[2]+1);
        $homotmp{$locus} = $line[3];
        $homo{$locus.":".$line[3]} = 0;
    }
    close $results;

    # get the more reliable strand of EST for conserved secondary structure (RNAstrand output)
    $queryFile =~ s/results\.dat/strand\.txt/;
    open my $strands, '<', $queryFile or die("Can not open the file '$queryFile': $!\n");
    while( my $row = <$strands> ) {
        # EST-id \t loci-start \t strand
        @line = split " ", $row;
        $eststrand{$line[0].":".$line[1]} = $line[2];
    }
    close $strands;

    # get homologous subsequence from loci through RNAz output file
    $queryFile =~ s/strand\.txt/out/;
    # $status: 0 = the next sequence line is the EST itself, 1 = inside the
    # list of aligned sequences; $extend: 1 = the current window continues
    # the previously stored hit
    my ($status, $extend, $estid, $eststart, $estend) = (0, 0, 0, 0, 0);
    open my $rnaz, '<', $queryFile or die("Can not open the file '$queryFile': $!\n");
    while( my $row = <$rnaz> ) {
        # every RNAz result block starts with its version banner
        $status = 0 if $row =~ /RNAz 1\.0pre/;
        next unless $row =~ /^>/;
        next if $row =~ /^>consensus/;
        @line = split " ", $row;
        $line[0] =~ s/>//;
        if( !$status ) {
            # a window of another EST or a window not starting inside the previous one ends the extension
            $extend = 0 if( $line[0] ne $estid || !($line[1]+1>=$eststart && $line[1]+1<=$estend) );
            $eststart = $line[1]+1;
            $estend = $line[1]+$line[2];
            if( !$extend ) {
                my $locus = $line[0].":".($line[1]+1);
                next unless defined $homotmp{$locus};
                $key = $locus.":".$homotmp{$locus};
                $estid = $line[0];
                # the alignment strand differs from the strand given in the strand file
                $strandswitch = ($line[3] eq $eststrand{$locus}) ? 0 : 1;
            }
            $status = 1;
        }
        else {
            @target = split /\|/, $line[0];
            next if $target[1] ne $org;
            if( !$extend ) {
                if( $strandswitch ) {
                    $hstrand = ($line[3] eq "+") ? "-" : "+";
                }
                else {
                    $hstrand = $line[3];
                }
                $hstart = $line[1]+1;
                $hend = $line[1]+$line[2];
                $homo{$key} = $target[2].":".$hstart.":".$hend.":".$hstrand;
                $extend = 1;
            }
            else {
                # extend the homologous organism alignment if possible
                if( $line[1]+1>=$hstart && $line[1]+1<=$hend ) {
                    $homo{$key} = $target[2].":".$hstart.":".($line[1]+$line[2]).":".$hstrand;
                }
                else {
                    $extend = 0;
                }
            }
        }
    }
    close $rnaz;

    return \%homo;
}

=head2 B<annotUTRs>

Annotate the input ESTs as 5'- or 3'-UTRs using the known genes of a homologous organism
    Input  - (1) text-file annotating all known genes of the homologous organism
	     (2) reference to a hash including as key the unique identifier of an EST (id:start:end) and as value the chromosome, start, end and strand of the aligned sequence of the homologous organism separated by ":"
	     (3) boolean if 5'-UTR is searched (1) or 3'-UTR (0)
    Output -  a reference to a hash including as key the unique identifier of an EST (like input) inside an UTR and as value the corresponding gene name

=cut

sub annotUTRs{
    # Returns a reference to a hash (EST identifier => gene name) for all
    # entries of %$homo_hashref whose aligned region overlaps a UTR exon on
    # the same chromosome and strand. $utr5 selects 5'-UTR exons (true) or
    # 3'-UTR exons (false). Dies if the gene file cannot be read.
    # Columns used from the gzipped gene file (0-based): 0 name,
    # 1 chromosome, 2 strand, 5 and 6 the two coding-region borders,
    # 8 exon starts, 9 exon ends (both comma separated).
    my ($self, $knownGene, $homo_hashref, $utr5) = @_;
    my (%utr, %UTR);

    # %utr: key = chromosome, value = pointer to an array of UTR exons, each
    # one an array of startpos., endpos., gene name and strand
    -r $knownGene or die("Can not open the file '$knownGene': $!\n");
    # list form of the pipe open: no shell is involved, so special
    # characters in the file name are harmless
    open my $genes, '-|', 'zcat', $knownGene or die("Can not run zcat on '$knownGene': $!\n");
    while( my $row = <$genes> ) {
        my @line = split " ", $row;
        next unless @line > 9;                  # incomplete line
        my @end = split ",", $line[9];
        my @start = split ",", $line[8];
        my ($name, $chr, $strand) = @line[0, 1, 2];

        # 5'-UTR exons: everything before column 6
        if( $utr5 ) {
            foreach my $i ( 0 .. $#end ) {
                if( $end[$i]<=$line[5] ) {
                    push( @{$utr{$chr}}, [$start[$i], $end[$i], $name, $strand] );
                }
                elsif( $start[$i]<$line[5] ) {
                    # exon spans the border: keep only the part before it
                    push( @{$utr{$chr}}, [$start[$i], $line[5]-1, $name, $strand] );
                }
            }
        }
        # 3'-UTR exons: everything behind column 7, walking the exon list backwards
        else {
            @end = reverse @end;
            @start = reverse @start;
            foreach my $i ( 0 .. $#end ) {
                if( $start[$i]>$line[6] ) {
                    push( @{$utr{$chr}}, [$start[$i], $end[$i], $name, $strand] );
                }
                elsif( $end[$i]>$line[6] ) {
                    # exon spans the border: keep only the part behind it
                    push( @{$utr{$chr}}, [$line[6]+1, $end[$i], $name, $strand] );
                }
            }
        }
    }
    close $genes or warn("zcat reported a problem while reading '$knownGene'\n");

    # get all candidate ncRNAs which lie in an UTR
    foreach my $est ( keys %$homo_hashref ) {
        my ($chr, $cstart, $cend, $cstrand) = split ":", $$homo_hashref{$est};
        next unless defined $chr && defined $utr{$chr};
        foreach my $exon ( @{$utr{$chr}} ) {
            next unless $cstrand eq $$exon[3];
            my $covers_start = ( $cstart<=$$exon[0] && $cend>$$exon[0] );
            my $covers_end   = ( $cstart<$$exon[1] && $cend>=$$exon[1] );
            my $inside       = ( $cstart>$$exon[0] && $cend<=$$exon[1] );
            # with several matching exons the last one decides the gene name
            $UTR{$est} = $$exon[2] if( $covers_start || $covers_end || $inside );
        }
    }

    return \%UTR;
}


=head2 B<compExistAlignments>

Compares the ncRNA candidates with the structure alignments listed in a file, using a reference organism to map the ESTs to the existing alignments
    Input  - (1) reference to a hash including as key the unique identifier of an EST (id:start:end) and as value the chromosome, start, end and strand of the aligned sequence of the homologous organism separated by ":"
	     (2) the file listing the alignments (per line one alignment consisting of chromosome, start, end, evalue and annotation separated by space)
	     (3) minimal rate that EST covers the alignment necessary for a hit
    Output - number of hits (hits will be written to standard output in BED-format)

=cut

sub compExistAlignments {
    # Prints one tab separated line (chromosome, alignment start, alignment
    # end, EST identifier with ":" replaced by "|", column 4 of the
    # alignment line, EST strand) to standard output for every listed
    # alignment that is sufficiently covered by an EST region, and returns
    # the number of printed lines. Dies if the list file cannot be opened.
    my ($self, $homo_hashref, $alignlistfile, $coverage) = @_;
    my %align;
    my $nr = 0;

    # read file listing structure alignments; only lines starting with "c"
    # are used
    open my $in, '<', $alignlistfile or die("Can not open the file '$alignlistfile': $!\n");
    while( my $row = <$in> ) {
        next unless $row =~ /^c/;
        my @line = split " ", $row;
        push( @{$align{$line[0]}}, [$line[1], $line[2], $line[3], $line[4]] );
    }
    close $in;

    # sort the structure alignments of each chromosome by start position
    foreach my $chr ( keys %align ) {
        @{$align{$chr}} = sort { $$a[0] <=> $$b[0] } @{$align{$chr}};
    }

    # test candidates of ncRNA if they overlap with a structure alignment
    foreach my $key ( keys %$homo_hashref ) {
        my @estitem = split ":", $$homo_hashref{$key};
        next unless defined $estitem[0] && defined $align{$estitem[0]};
        ( my $name = $key ) =~ s/\:/\|/g;

        foreach my $item ( @{$align{$estitem[0]}} ) {
            my ($astart, $aend) = ($$item[0], $$item[1]);
            my $alength = $aend-$astart;

            # EST starts before (or at) the alignment start and reaches into it
            if( $estitem[1]<=$astart && $estitem[2]>$astart ) {
                # EST ends inside the alignment: the covered part has to be large enough
                next if $estitem[2]<$aend && (($estitem[2]-$astart)/$alength)<$coverage;
            }
            # EST starts inside the alignment
            elsif( $estitem[1]>$astart && $estitem[1]<$aend ) {
                next if (($aend-$estitem[1])/$alength)<$coverage;
                next if (($estitem[2]-$estitem[1])/$alength)<$coverage;
            }
            else {
                next;
            }

            print "$estitem[0]\t$$item[0]\t$$item[1]\t$name\t$$item[2]\t$estitem[3]\n";
            $nr++;
        }
    }

    return $nr;
}

1;

=head1 AUTHOR

Stefan Seemann, E<lt>seemann@bioinf.uni-leipzig.deE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006 by Stefan Seemann

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.6 or,
at your option, any later version of Perl 5 you may have available.

=cut
