package EST2ncRNA::Pipeline;

use strict;
use warnings;

use EST2ncRNA::MysqlInterface;
use EST2ncRNA::ServerInterface;
use EST2ncRNA::RavennaInterface;
use EST2ncRNA::BlastInterface;
use EST2ncRNA::UCSCInterface;
use EST2ncRNA::RNAzInterface;
use EST2ncRNA::ClustalwInterface;
use EST2ncRNA::Annotation;

# Exporter is loaded only so the package can inherit from it; this module is
# used through its object interface and exports nothing.
require Exporter;

our @ISA = qw(Exporter);  # inherits from Exporter

# Items to export into the caller's namespace on request. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.
# Both export lists are intentionally empty.

our @EXPORT_OK = qw(
		    );

our @EXPORT = qw();

# module version
our $VERSION = '0.01';


# constructor
#
# input  - class name
# output - new Pipeline object; every interface attribute and the uid start
#          undefined, the working directory defaults to "." and the
#          only_read_mysql flag to 0
sub new {
    my ($class) = @_;

    # these slots are filled in later by the accessor methods of the same name
    my %attributes = map { $_ => undef } qw(
        _mysql _server _ravenna _blast _ucsc _rnaz _clustalw _annotation _uid
    );
    $attributes{_workdir}         = ".";
    $attributes{_only_read_mysql} = 0;

    return bless \%attributes, $class;
}


# accessor method for Pipeline mysql-interface
#
# input  - datasource, username and password of the connected database
#          or nothing if the interface object already exists
# output - mysql attribute (undef until a datasource has been given)
# dies   - if the EST2ncRNA::MysqlInterface constructor throws or returns
#          no object
sub mysql {
    my ($self, $datasource, $username, $password) = @_;

    if ( defined $datasource ) {
        # create new EST2ncRNA::MysqlInterface class instance; the explicit
        # Class->new form is used because this package defines its own new(),
        # which makes the indirect "new Class(...)" form ambiguous to parse
        $self->{_mysql} = eval {
            EST2ncRNA::MysqlInterface->new($datasource, $username, $password);
        } or die( $@ || "EST2ncRNA::MysqlInterface->new returned no object\n" );
    }

    return $self->{_mysql};
}


# accessor method for Pipeline server-interface
#
# input  - servername
#          or nothing if the server instance already exists
# output - server attribute (undef until a servername has been given)
# dies   - if the EST2ncRNA::ServerInterface constructor throws or returns
#          no object
sub server {
    my ($self, $servername) = @_;

    if ( defined $servername ) {
        # create new EST2ncRNA::ServerInterface class instance from the
        # servername, the pipeline working directory and the pipeline uid;
        # Class->new avoids the ambiguous indirect "new Class(...)" form
        $self->{_server} = eval {
            EST2ncRNA::ServerInterface->new($servername, $self->workdir, $self->uid);
        } or die( $@ || "EST2ncRNA::ServerInterface->new returned no object\n" );
    }

    return $self->{_server};
}


# accessor method for Pipeline ravenna-interface
#
# input  - home directory of RaveNnA, Rfam models file, Rfam seed file
#          or nothing if the ravenna instance already exists
#          (a new instance is created whenever the Rfam models file is given;
#          any further arguments are ignored)
# output - ravenna attribute
# dies   - if the EST2ncRNA::RavennaInterface constructor throws or returns
#          no object
sub ravenna {
    my ($self, $ravennahome, $rfammodel, $rfamseed) = @_;

    if ( defined $rfammodel ) {
        # create new EST2ncRNA::RavennaInterface class instance;
        # Class->new avoids the ambiguous indirect "new Class(...)" form
        $self->{_ravenna} = eval {
            EST2ncRNA::RavennaInterface->new($self->workdir, $ravennahome, $rfammodel, $rfamseed, $self->server);
        } or die( $@ || "EST2ncRNA::RavennaInterface->new returned no object\n" );
    }

    return $self->{_ravenna};
}


# accessor method for Pipeline blast-interface
#
# The interface object is created on first use from the pipeline working
# directory, server and uid; arguments passed by callers are ignored.
#
# output - blast attribute
# dies   - if the EST2ncRNA::BlastInterface constructor throws or returns
#          no object
sub blast {
    my ($self) = @_;

    if ( !defined $self->{_blast} ) {
        # create new EST2ncRNA::BlastInterface class instance;
        # Class->new avoids the ambiguous indirect "new Class(...)" form
        $self->{_blast} = eval {
            EST2ncRNA::BlastInterface->new($self->workdir, $self->server, $self->uid);
        } or die( $@ || "EST2ncRNA::BlastInterface->new returned no object\n" );
    }

    return $self->{_blast};
}


# accessor method for Pipeline ucsc-interface
#
# The interface object is created on first use from the pipeline working
# directory, server and uid; arguments passed by callers are ignored.
#
# output - ucsc attribute
# dies   - if the EST2ncRNA::UCSCInterface constructor throws or returns
#          no object
sub ucsc {
    my ($self) = @_;

    if ( !defined $self->{_ucsc} ) {
        # create new EST2ncRNA::UCSCInterface class instance;
        # Class->new avoids the ambiguous indirect "new Class(...)" form
        $self->{_ucsc} = eval {
            EST2ncRNA::UCSCInterface->new($self->workdir, $self->server, $self->uid);
        } or die( $@ || "EST2ncRNA::UCSCInterface->new returned no object\n" );
    }

    return $self->{_ucsc};
}


# accessor method for Pipeline rnaz-interface
#
# The interface object is created on first use from the pipeline working
# directory, server and uid; arguments passed by callers are ignored.
#
# output - rnaz attribute
# dies   - if the EST2ncRNA::RNAzInterface constructor throws or returns
#          no object
sub rnaz {
    my ($self) = @_;

    if ( !defined $self->{_rnaz} ) {
        # create new EST2ncRNA::RNAzInterface class instance;
        # Class->new avoids the ambiguous indirect "new Class(...)" form
        $self->{_rnaz} = eval {
            EST2ncRNA::RNAzInterface->new($self->workdir, $self->server, $self->uid);
        } or die( $@ || "EST2ncRNA::RNAzInterface->new returned no object\n" );
    }

    return $self->{_rnaz};
}


# accessor method for Pipeline clustalw-interface
#
# The interface object is created on first use from the pipeline working
# directory, server and uid.
#
# output - clustalw attribute
# dies   - if the EST2ncRNA::ClustalwInterface constructor throws or returns
#          no object
sub clustalw {
    my ($self) = @_;

    if ( !defined $self->{_clustalw} ) {
        # create new EST2ncRNA::ClustalwInterface class instance;
        # Class->new avoids the ambiguous indirect "new Class(...)" form
        $self->{_clustalw} = eval {
            EST2ncRNA::ClustalwInterface->new($self->workdir, $self->server, $self->uid);
        } or die( $@ || "EST2ncRNA::ClustalwInterface->new returned no object\n" );
    }

    return $self->{_clustalw};
}


# accessor method for Pipeline annotation
#
# The annotation object is created on first use from the pipeline working
# directory and server; arguments passed by callers are ignored.
#
# output - annotation attribute
# dies   - if the EST2ncRNA::Annotation constructor throws or returns
#          no object
sub annotation {
    my ($self) = @_;

    if ( !defined $self->{_annotation} ) {
        # create new EST2ncRNA::Annotation class instance;
        # Class->new avoids the ambiguous indirect "new Class(...)" form
        $self->{_annotation} = eval {
            EST2ncRNA::Annotation->new($self->workdir, $self->server);
        } or die( $@ || "EST2ncRNA::Annotation->new returned no object\n" );
    }

    return $self->{_annotation};
}


# accessor method for Pipeline uid
#
# The ID is generated once via unique_id and then cached on the object.
#
# output - unique ID for this pipeline instance
sub uid {
    my $self = shift;

    unless ( defined $self->{_uid} ) {
        $self->{_uid} = $self->unique_id;
    }

    return $self->{_uid};
}


# accessor method for Pipeline working directory
# (described by the original author as the working directory on the local
# machine and on the remote server; this method only touches the local one)
#
# input  - working directory
#          or nothing if the current value should just be returned
# output - workdir attribute
sub workdir {
    my ($self, $workdir) = @_;

    if ( defined $workdir ) {
        $self->{_workdir} = $workdir;

        # create workdir on the local machine if it does not exist yet;
        # mkdir only creates the last path component, so a failure is
        # reported instead of being silently ignored
        if ( !-d $workdir ) {
            mkdir $workdir
                or warn "Can not create working directory '$workdir': $!\n";
        }
    }

    return $self->{_workdir};
}


# accessor method for Pipeline only_read_mysql flag
#
# input  - boolean telling whether the database is read-only (optional)
# output - only_read_mysql attribute
sub only_read_mysql {
    my $self = shift;
    my ($readonly) = @_;

    if ( defined $readonly ) {
        $self->{_only_read_mysql} = $readonly;
    }

    return $self->{_only_read_mysql};
}


# create unique ID for temporary file
#
# The ID is derived from the current process ID: the letter 'a' is put in
# front of the PID and the last four characters are returned, so PIDs with
# four or more digits give their last four digits and a three-digit PID
# gives 'a' plus the PID. Shorter PIDs are zero-padded to three digits
# first; without the padding they used to wrap around and produce garbled
# IDs such as "a5a5".
#
# output - unique ID (always four characters)
sub unique_id {
    my ($self) = @_;

    # 'a' marks the value as a string and fills the fourth position for PIDs
    # below 1000
    my $tagged = 'a' . sprintf('%03d', $$);

    return substr($tagged, -4);
}

# creates or cleans the pipeline database and stores the input assembled EST data in it
#
# input  - the filename of the assembled EST data
# output - the number of inserted assembled ESTs
#          (whatever the mysql interface's insert_assest returns)
sub store_ass_ests {
    my $self    = shift;
    my $estFile = shift;

    # prepare the db first, then load the assembled ESTs into it
    $self->mysql->prepare_db;
    my $inserted = $self->mysql->insert_assest($estFile);

    return $inserted;
}


# rejects small ESTs from the ncRNA candidates (recommended length is 60nts)
#
# input  - unusable length
# output - the number of rejected ESTs
#          (whatever delete_small_ests_from_candncrna returns)
sub reject_small_ests {
    my $self   = shift;
    my $length = shift;

    my $rejected = $self->mysql->delete_small_ests_from_candncrna($length);

    return $rejected;
}


# rejects protein coding RNAs from the ncRNA candidates
# for this, blastx is used against protein databases (e.g. 'nr', 'swissprot')
#
# input  - a reference to an array with all available blastx-result files
#          a reference to an array with the peptide sequence db files
#          [result files of Jan:
#           my $PATH = "/chromo1/a/users/pigadmin/projects/est/distiller/lib_analysis002/blast";
#           my @blast_res = ( "$PATH/distiller_clusters-nr-1.blast.mlevel.gz",
#                             "$PATH/distiller_singletons-nr-1.blast.mlevel.gz",
#                             "$PATH/distiller_clusters-sp-10.blast.mlevel.gz",
#                             "$PATH/distiller_singletons-sp-10.blast.mlevel.gz" );]
# output - the number of protein coding RNAs
#          (whatever the mysql interface's insert_codingrna returns)
sub reject_coding_rnas {
    my $self          = shift;
    my $blastxres_ref = shift;
    my $peptidDB_ref  = shift;

    # run BLASTX against peptide dbs -- not implemented yet; the branch is a
    # placeholder, so the peptide db list currently has no effect
    if ( $#{$peptidDB_ref} >= 0 ) {
        # implementation of Jan's blasting stuff
        # ....
        # add result blast file to the array @blastxres
    }

    # list the blastx result files that are about to be stored
    print "@{$blastxres_ref}\n";
    my $stored = $self->mysql->insert_codingrna($blastxres_ref);

    return $stored;
}


# rejects known ncRNAs from the ncRNA candidates
# stores all RaveNnA hits in the knownncrna-table and deletes plausible known ncRNAs from the candncrna-table
# uses the software RaveNnA assigning the sequences to covariance models of known ncRNAs using a statistical context-free grammar
# tests the RaveNnA results for plausibility (included gaps, length, basepair rate)
#
# requirements for a successful execution: a subdirectory 'ravenna' including
#   (1) the RaveNnA package (ravenna-0.2f.tar.gz) if $NEWINSTALL = 1
#   (2) the Rfam models (Rfam.tar.gz from 'http://www.sanger.ac.uk/Software/Rfam/ftp.shtml')
#   (3) the annotated seed alignments (Rfam.seed also from 'http://www.sanger.ac.uk/Software/Rfam/ftp.shtml')
#   (4) the qsub script (pbsubmit.pl) for using pbs jobs
#
# input  - home directory of RaveNnA
#          Rfam model file
#          annotated seed alignments file
#          (the server is taken from the pipeline object)
# output - the number of ESTs that are related to known ncRNAs
#          (returning number can be different to the number of entries in the knownncrna-table storing all hits because of multiple hits of one assembled EST and implausible hits)
#          in read-only mode: the number of lines written to 'knownncrna.out'
# dies   - if 'knownncrna.out' can not be written in read-only mode
sub reject_known_ncrnas {
    my ($self, $ravennahome, $rfammodel, $rfamseed) = @_;
    my ($refparam, $refbp, $refwindow, $refknownncrna);
    my $res = 0;

    # create EST2ncRNA::RavennaInterface class instance as attribute of object $pipeline
    $self->ravenna($ravennahome, $rfammodel, $rfamseed, $self->server);

    # create sequence fasta file with all candidate ncRNAs from the mySQL-db
    # (currently disabled)
    #$self->mysql->get_candncrna_fasta($self->ravenna->subdir."/db.fa.gz");

    # catch the family informations (RaveNnA parameters and basepair rate) from the Rfam.seed file
    ( $refparam, $refbp, $refwindow ) = $self->ravenna->catch_rfam_seed($rfamseed, "ALL");

    # run RaveNnA (currently disabled)
    #$self->ravenna->run_ravenna($self->ravenna->subdir."/db.fa.gz", $rfammodel, $refparam);

    # analyse the results
    # NOTE(review): the result directory is hard-coded to one user's home
    # directory; the disabled line below is the generic variant -- confirm
    # which one is intended before running this elsewhere.
#    $refknownncrna = $self->ravenna->filter_ravenna_hits($self->ravenna->subdir, $refbp, $self->ravenna->length,$self->mysql);
    $refknownncrna = $self->ravenna->filter_ravenna_hits("/home/users/seemann/pipeline/proteincoding/ravenna", $refbp, $refwindow, $self->mysql);

    # update the mySQL-db or write known ncRNAs in a file
    if ( $self->only_read_mysql ) {
        my $outfile = "knownncrna.out";

        # 'or' instead of '||': with '||' the die was bound to the file name
        # string and could never fire
        open my $out, '>', $outfile
            or die("Can not open the file '$outfile': $!\n");

        # one tab-separated line with the first seven fields per hit
        foreach my $hit ( @$refknownncrna ) {
            print {$out} join("\t", @{$hit}[0 .. 6]) . "\n";
            $res++;
        }

        close $out
            or die("Can not close the file '$outfile': $!\n");
    }
    else {
        $res = $self->mysql->insert_knownncrna($refknownncrna);
    }

    return $res;
}


# stores the source organism (origin, e.g. pig) specific ESTs in the origspecest-table and deletes them from the candncrna-table
# uses BLASTN to find related sequences to a near related organism genome (e.g. cattle) and stores blastn-output in blastn-table
#
# input  - a fasta file with query sequences or '0' if the database (candncrna-table entries) should be used
#          a reference to a hash with one entry
#             key: the name of the query organism
#             value: a text file with one line per chromosome of the near related organism;
#                    each line is split on whitespace into a chromosome name and its FASTA-file
#                    (chromosome FASTA-files should start with a global ID for the chromosome ('gnl|organism|chromosome'))
#          using of advanced blastn parameter? (0..NO, location of the UniVec_Core file for blastn vector filtering..YES)
#             advanced blastn parameter need a highly increased calculation time!
#             therefore an adapted Serial BLAST Strategy is used:
#                 3 parts: search 1 with standard parameter,
#                          sequence retrieval,
#                          search 2 of ESTs against related chromosomes with advanced parameter
#          home directory of formatdb, home directory of BLAST
#          e-value, length and identity settings handed to the blast interface
# output - the number of ESTs which are not conserved in the near related organism
#          (the value returned by the last bed-export or estcoverage update; 0 if neither ran)
# dies   - if the chromosome list file can not be opened
sub find_conserved_rna_in_closed_organism {
    my ($self, $queryFile, $closedorg_hashref, $advancedblastn, $FORMATDB_HOME, $BLAST_HOME, $evalue, $length, $identity) = @_;
    my ($fastafiles_arrayref, $blastres, $advquery_ref, $blasttable, $output, %chr);
    my $res = 0;

    # create EST2ncRNA::BlastInterface class instance as attribute of object $pipeline
    $self->blast;
    $self->blast->advblast($advancedblastn);
    $self->blast->formatdb($FORMATDB_HOME);
    $self->blast->blast($BLAST_HOME);
    $self->blast->evalue($evalue);
    $self->blast->length($length);
    $self->blast->identity($identity);

    my $SUBDIR = $self->blast->subdir;

    my $blastname = ($self->blast->advblast) ? "advblast" : "stdblast";

    # specify query-file name
    my $query = ( $queryFile ) ? $queryFile : "$SUBDIR/blastquery".$self->uid;

    # catch the name of the near related organism and the fasta file name from input
    my ( $coname, $coanno ) = (keys %$closedorg_hashref)[0] =~ /(\w+)\W*((\w+?).*)/;
    my $closedorg = (values %$closedorg_hashref)[0];

    # test if blastnannotation already exists, if yes then the program dies
    $self->mysql->test_assembly($coname, $coanno, $self->blast->advblast);

    print "Search ESTs which are conserved in $coname.\n";

    # catch the names of chromosome files in a hash
    # ('or' instead of '||': with '||' the die was bound to the file name and
    # a missing list file went unnoticed)
    open my $in, '<', $closedorg
        or die("Can not open the file '$closedorg': $!\n");
    while ( my $line = <$in> ) {
        chomp $line;
        my ($chrom, $file) = split " ", $line;
        next unless defined $chrom;    # skip blank lines
        #( $chrom ) = $line =~ /(chr\w+)\./;
        $chr{$chrom} = $file;
    }
    close $in;

    # concatenate the chromosome files in the input file to one big genome file
    # (currently disabled)
    #my $subject = "$SUBDIR/blastsubject".$self->uid.".fa.gz";
    #`cat @chr > $subject`;

    #### STANDARD BLASTN ####

    # create sequence fasta file with all candidate ncRNAs from the mySQL-db
    $fastafiles_arrayref = $self->mysql->get_candncrna_fasta($query, $self->blast->seqnr) unless $queryFile;
    #$splits = `zcat $queryFile | grep ">" | wc -l` if $queryFile;

    # run blastn with standard parameter
    $output = "stdblastquery".$self->uid.".blast.gz";
    $output = $self->blast->run_standard_blastn($fastafiles_arrayref, \%chr, $output);

    # create table format of blast output: remove a stale table, convert the
    # blast output and compress the resulting table
    my $std_base = "$SUBDIR/stdblastquery".$self->uid;
    $blasttable = "$std_base.blast.table.gz";
    unlink $blasttable if -e $blasttable;
    `kvlblast2table -M -u ${std_base}.blast.gz`;
    `gzip ${std_base}.blast.table`;
    # the standard table is also the input of the advanced search below
    $blastres = $blasttable;

    if ( $queryFile || $self->only_read_mysql ) {
        # write the results in a file (only when a query file was given)
        $res = $self->blast->get_blast_output_bed($blasttable, $coname.".".$queryFile.".".$blastname.".bed") if $queryFile;
    }
    else {
        # analyse the results and update the mySQL-db
        $self->mysql->insert_blastn($coname, $coanno, $blasttable, 0);
        # update estcoverage table
        $res = $self->mysql->insert_estcoverage_from_blastn($coname, $coanno, 0, \%chr, $self->blast->length, $self->blast->identity);
    }

    if ( $self->blast->advblast ) {
        #### ADVANCED BLASTN ####

        # for all closed organism chromosomes in %chr creates a sequence fasta file with candidate ncRNAs which hit it
        $advquery_ref = $self->mysql->get_candncrna_fasta_from_blasthits(\%chr, $blastres, $self->blast->subdir, $self->uid);

        # run blastn with advanced parameter
        $output = "advblastquery".$self->uid.".blast.gz";
        $output = $self->blast->run_advanced_blastn($advquery_ref, \%chr, $output);

        # create table format of blast output; unlike the standard search the
        # file name returned by run_advanced_blastn is the one converted
        my $adv_base = "$SUBDIR/advblastquery".$self->uid;
        $blasttable = "$adv_base.blast.table.gz";
        unlink $blasttable if -e $blasttable;
        `kvlblast2table -M -u $SUBDIR/$output`;
        `gzip ${adv_base}.blast.table`;

        if ( $queryFile || $self->only_read_mysql ) {
            # write the results in a file (only when a query file was given)
            $res = $self->blast->get_blast_output_bed($blasttable, $coname.".".$queryFile.".".$blastname.".bed") if $queryFile;
        }
        else {
            # analyse the results and update the mySQL-db
            $self->mysql->insert_blastn($coname, $coanno, $blasttable, $self->blast->advblast);
            # update estcoverage table
            $res = $self->mysql->insert_estcoverage_from_blastn($coname, $coanno, $self->blast->advblast, \%chr, $self->blast->length, $self->blast->identity);
        }
    }

    return $res;
}


# searches homologous sequences in other mammalians to the conserved ESTs
# uses available pairwise alignments of the UCSC Download Server (ftp://hgdownload.cse.ucsc.edu/goldenPath) and the UCSC tool liftOver
# how to get ".over.chain" files:
# (1) ftp to hgdownload.cse.ucsc.edu
# (2) go to the directory goldenPath/{target organism}
# (3) go to the directory liftOver (e.g. goldenPath/bosTau2/liftOver) for over.chain files
# (4) choose an over.chain file for the desired mammalian (e.g. for cow-mouse-alignment bosTau2ToMm7.over.chain)
#
# input  - a BED-file with query subsequence coordinates or '0' if the database (estcoverage-table entries) should be used
#          a reference to a hash with several entries
#             key: a text file including per line the name of one chromosome FASTA-file of the subject organism
#             value: the over.chain file
#          the minimum ratio of bases that must remap (recommended value is 0.8)
# output - hash with pairs of 'name of subject mammalian':'number of conserved ESTs'
# dies   - if a chromosome list file can not be read or a result file can
#          not be written
sub search_homologous_sequences_in_other_mammalians_with_ucsc_over_chain {
    my ($self, $queryFile, $pa_hashref, $minMatch) = @_;
    my ($overchain, $query, $subject, %chr, $queryfile, $refcand);
    my %nr = ();

    # create EST2ncRNA::ServerInterface class instance 'localhost'
    # NOTE(review): the object is never used afterwards; the construction is
    # kept in case it has side effects -- confirm and remove if it has none.
    my $server = eval { EST2ncRNA::ServerInterface->new("localhost",".",0,0,0); } or die ($@);

    # create EST2ncRNA::UCSCInterface class instance as attribute of object $pipeline
    $self->ucsc;
    my $SUBDIR = $self->ucsc->subdir;

    foreach my $genome (keys %$pa_hashref) {

        $overchain = $$pa_hashref{$genome};
        # query and subject assembly names are taken from the chain file name
        ( $query, $subject ) = $overchain =~ /(\w+)To(\w+)\./;

        print "Find pairwise alignments between $query and $subject.\n";

        # catch the names of the chromosome files in a hash
        # ('or' instead of '||': with '||' the die could never fire)
        # NOTE(review): %chr is shared by all genomes of this call, so entries
        # of earlier genomes stay in it -- confirm that this is intended.
        open my $in, '<', $genome
            or die("Can not open the file '$genome': $!\n");
        while ( my $line = <$in> ) {
            chomp $line;
            my ( $chrom ) = $line =~ /(chr\S+).fa.gz$/;
            next unless defined $chrom;    # skip lines that name no chromosome file
            $chr{$chrom} = $line;
        }
        close $in;

        # creates a BED-file including the estcoverage-table $query subsequence coordinates
        $queryfile = ( $queryFile ) ? $queryFile : $self->mysql->get_estcoverage_bed($query, $SUBDIR);

        # search pairwise alignments of the estcoverage-table $query subsequences to $subject in the UCSC chain-file
        # get back a hash with the pairs successfully mapped source EST : mapped subject subsequence
        $refcand = $self->ucsc->run_liftOver($query, $queryfile, $overchain, $minMatch);
        $nr{$subject} = keys %$refcand;

        if ( $queryFile || $self->only_read_mysql ) {
            # write the mapped pairs to a file next to the chromosome list
            my $outfile = "$genome.over.chain.out";
            open my $out, '>', $outfile
                or die("Can not open the file '$outfile': $!\n");
            foreach my $est ( keys %$refcand ) {
                print {$out} "$est\t$$refcand{$est}\n";
            }
            close $out
                or die("Can not close the file '$outfile': $!\n");
        }
        else {
            # analyse the results and update the 'estcoverage'-table of the mySQL-db
            $self->mysql->insert_estcoverage($refcand, $subject, \%chr);
        }
    }

    return \%nr;
}


# searches homologous sequences in several mammalians to the conserved ESTs
# uses available multiple alignments of the UCSC Download Server (ftp://hgdownload.cse.ucsc.edu/goldenPath)
# how to get "maf" files:
# (1) go to the directory multiz{nr}way (e.g. goldenPath/hg17/multiz17way)
# (2) select the maf files of all chromosomes (e.g. chr1.maf.gz ..)
#
# input  - a text-file with query subsequence coordinates or '0' if the database (estcoverage-table entries) should be used
#          a reference to a hash with several entries
#             key: the name of the query organism
#             value: a text file including per line the path of one maf-file
#          the minimum ratio of bases that must remap (recommended value is 0.6)
# output - hash with pairs of 'name of subject mammalian':'number of conserved ESTs'
# dies   - if a maf list file can not be opened
sub search_homologous_sequences_in_other_mammalians_with_ucsc_maf {
    my ($self, $queryFile, $ma_hashref, $minMatch) = @_;
    my ($maf, $OUTPUTFILE, %chr, $queryfile, $mafout_ref);
    my %nr = ();

    # create EST2ncRNA::UCSCInterface class instance as attribute of object $pipeline
    $self->ucsc($self->server);
    my $SUBDIR = $self->ucsc->subdir;

    foreach my $queryorg ( keys %$ma_hashref ) {

        $maf = $$ma_hashref{$queryorg};
        $OUTPUTFILE = $queryorg.".mafout.txt.gz";

        print "Find multiple alignments to $queryorg.\n";

        # catch the names of the multiple alignment files (per chromosome one) in a hash
        # ('or' instead of '||': with '||' the die could never fire)
        open my $in, '<', $maf
            or die("Can not open the file '$maf': $!\n");
        while ( my $line = <$in> ) {
            chomp $line;
            my ( $chrom ) = $line =~ /(chr\S+).maf.gz$/;
            next unless defined $chrom;    # skip lines that name no maf file
            $chr{$chrom} = $line;
        }
        close $in;

        # creates a file including the estcoverage-table $query subsequence coordinates
        $queryfile = $self->mysql->get_estcoverage_for_maf($queryorg, $SUBDIR) unless $queryFile;
        # or fetch the $query subsequence coordinates from a bed-file
        $queryfile = $self->ucsc->get_mafscan_input($queryFile, $queryorg) if $queryFile;

        # search multiple alignments of estcoverage-table subsequences of the source or a subject organism in the UCSC maf-file
        # get back a hash with pairs of successfully mapped estcoverage primary key vs remaining estcoverage values
        # (est_id:est_start:subject_name:align_type vs subject_start:subject_end:subject_seq)
        $mafout_ref = $self->ucsc->scan_maf($queryfile, \%chr, $queryorg, $minMatch, $OUTPUTFILE);

        # count the occurrence of every organism; the organism is the word
        # after 'gnl|' in the third field of the key. Keys without such a
        # field are skipped -- they used to be counted under whatever an
        # earlier match had left in $1.
        foreach my $key ( keys %$mafout_ref ) {
            my $sid = ( split ":", $key )[2];
            next unless defined $sid && $sid =~ /^gnl\|(\w+)/;
            $nr{$1}++;
        }

        # analyse the results; like the other pipeline steps the database is
        # left alone when a query file is given or the db is read-only
        # (the condition used to test $queryFile twice)
        if ( $queryFile || $self->only_read_mysql ) {
            # get BED-file of $self->ucsc->scan_maf output
            $self->ucsc->mafscan2bed($OUTPUTFILE, $queryorg.".mafout.bed") if $queryFile;
        }
        else {
            # analyse the results and update the 'estcoverage'-table of the mySQL-db
            $self->mysql->insert_estcoverage($mafout_ref);
        }
    }

    return \%nr;
}


# executes clustalw to get alignments of the candidate ESTs
#
# output - reference to an array including all alignment maf-files
#          (whatever the clustalw interface's aln2maf returns)
sub align_homologous_sequences {
    my $self = shift;

    # length value handed to aln2maf
    my $maf_length = 10000;

    # create EST2ncRNA::ClustalwInterface class instance as attribute of object $pipeline
    # (the RNAz interface was used here once; that variant is disabled)
    #$self->rnaz;
    #my $SUBDIR = $self->rnaz->subdir;
    $self->clustalw;
    my $clustalw_dir = $self->clustalw->subdir;

    # write one fasta-file per EST holding the EST and its homologous
    # sequences; the returned list of files is not used any further
    my $fasta_files = $self->mysql->get_homologous_fasta("$clustalw_dir/" . $self->clustalw->fastadir);

    # let clustalw calculate a multiple alignment for each candidate EST
    # using the EST-coverage data
    $self->clustalw->calculate_multiple_alignments();

    # fetch for each EST a hash including its homologous sequences ...
    my $homologs = $self->mysql->get_homologous;

    # ... and turn the clustalw output into maf-files
    my $maf_files = $self->clustalw->aln2maf($homologs, $maf_length);

    return $maf_files;
}

# executes RNAz to predict novel non-coding RNAs
#
# input  - (1) reference to an array including all alignment maf-files
#          (2) home directory of RNAz
#          (3) home directory of the RNAz scripts
#          (4) Boolean if shuffled alignments should be used for significance test
#          (5) optional version (standard is "best alignments" using all estcoverage alignments, otherwise a list of used organism names separated by ":")
# output - number of novel non-coding RNAs, or - for shuffled alignments -
#          a text reporting the false positive rates for p-values 0.5 and 0.9
sub search_rna_genes {
    my ($self, $refmaffiles, $rnazhome, $rnazscripthome, $random, $version) = @_;

    # create EST2ncRNA::RNAzInterface class instance as attribute of object $pipeline
    $self->rnaz($self->server);
    if ( defined $version ) {
        $self->rnaz->version($version);
    }
    $self->rnaz->rnazhome($rnazhome);
    $self->rnaz->rnazscripthome($rnazscripthome);
    my $server_home = $self->server->home;
    my $rnaz_dir    = $self->rnaz->subdir("proteincoding");

    # shuffled runs get their own output names
    my ($rnaz_name, $cluster_name) = $random
        ? ( "est.rnaz.random.out", "est.rnaz.random.results.dat" )
        : ( "est.rnaz.out",        "est.rnaz.results.dat" );

    # start rnazWindow.pl of the RNAz package to slice, pre-process and filter the alignments
    my $windows = $self->rnaz->run_rnazWindow($refmaffiles);

    # randomize alignments (mononucleotide shuffling)
    if ( $random ) {
        $windows = $self->rnaz->run_rnazRandomizeAln("$server_home/$rnaz_dir", $windows);
    }

    # run RNAz with a P value cutoff of 0.5 and cluster the results
    my $rnaz_file    = $self->rnaz->run_rnaz($windows, $rnaz_name);
    my $cluster_file = $self->rnaz->run_rnazCluster($rnaz_file, $cluster_name);

    # analyse the results and update the mySQL-db
    # (real alignments and a writable database only)
    if ( !$random && !$self->only_read_mysql ) {
        $self->mysql->insert_rnazwindows("$rnaz_dir/$rnaz_file", "$rnaz_dir/$cluster_file", $self->rnaz->version);
    }

    # number of ncRNA candidates
    my $result = $self->mysql->get_nr_candncrna;

    # for shuffled alignments report the estimated false positive rate
    # (significance test) instead
    if ( $random ) {
        my $report = "";
        foreach my $pvalue ( 0.5, 0.9 ) {
            my $rate = $self->mysql->RNAz_statistics("$rnaz_dir/$cluster_file", $pvalue);
            $report .= "False positive rate of RNAz prediction with p-value $pvalue is $rate.\n";
        }
        $result = $report;
    }

    # return number of ncRNA candidates or false positive rate
    return $result;
}


# annotate tRNAs in candncrna-table and knownncrna-table with tRNAscan-SE
#
# output - reference to an array with two messages: how many novel ncRNAs and
#          how many known ncRNAs are annotated as tRNAs
#          (with a read-only database the hits are printed to stdout instead
#          of being stored)
sub annotate_tRNA {
    my $self = shift;
    my @messages;

    # create EST2ncRNA::Annotation class instance as attribute of object $pipeline
    $self->annotation;
    my $anno_dir = $self->annotation->subdir;

    # --- tRNAs in the RNAz output (novel candidates) ---
    my $cand_fasta  = "$anno_dir/candncrna.fasta.gz";
    my $cand_result = "$anno_dir/tRNA.cand.result";
    my $cand_sql    = q{select ass_id, cand_start, cand_end, ass_sequence from assest, candncrna where cand_id=ass_id};
    $self->mysql->get_statement_fasta($cand_sql, 1, $cand_fasta);
    my $cand_hits = $self->annotation->tRNAscan_SE($cand_fasta, $cand_result);

    my $cand_count;
    if ( !$self->only_read_mysql ) {
        # insert annotation in the candncrna-table
        $cand_count = $self->mysql->insert_cand_annotation($cand_hits);
    }
    else {
        # print hits to stdout
        $cand_count = 0;
        foreach my $id ( keys %$cand_hits ) {
            print "$id\t$$cand_hits{$id}\n";
            $cand_count++;
        }
    }
    push @messages, "$cand_count novel ncRNAs are annotated as tRNAs.";

    # --- tRNAs in the known ncRNAs predicted by RaveNnA ---
    my $known_fasta  = "$anno_dir/knownRF00005.fasta.gz";
    my $known_result = "$anno_dir/tRNA.known.result";
    my $known_sql    = q{select ass_id, known_start, known_end, ass_sequence from assest, knownncrna where ass_id=known_id and known_family="RF00005" and known_status="HIT"};
    $self->mysql->get_statement_fasta($known_sql, 1, $known_fasta);
    my $known_hits = $self->annotation->tRNAscan_SE($known_fasta, $known_result);

    my $known_count;
    if ( !$self->only_read_mysql ) {
        # insert annotation in knownncrna-table
        $known_count = $self->mysql->update_known_ncRNA($known_hits);
    }
    else {
        # print hits to stdout
        $known_count = 0;
        foreach my $id ( keys %$known_hits ) {
            print "$id\t$$known_hits{$id}\n";
            $known_count++;
        }
    }
    push @messages, "$known_count known ncRNAs are annotated as tRNAs.";

    return \@messages;
}


# Blast searches in a non-coding RNA database (Rfam, NONCODE) annotating entire ESTs in candncrna-table entries without poly-A tail and writing the annotation in the annotatedncrna table
#
# input  - a fasta file with query sequences or '0' if the database should be used
#          a reference to an array with the ncRNA database files
#          home directory of formatdb, home directory of BLAST
# output - reference to a hash with pairs of 'database file name':'number of annotated ESTs'
#          (with a query file or a read-only database the hits are printed
#          to stdout instead of being stored)
sub annotate_using_ncRNA_db {
    my ($self, $queryFile, $db_arrayref, $FORMATDB_HOME, $BLAST_HOME) = @_;
    my %hits_per_db;
    my $splits = 0;

    # create EST2ncRNA::BlastInterface class instance as attribute of object $pipeline
    $self->blast($self->server);
    $self->blast->blast($BLAST_HOME);
    $self->blast->formatdb($FORMATDB_HOME);
    my $blast_dir = $self->blast->subdir;

    # create sequence fasta file with all candidate ncRNAs from the mySQL-db
    # unless the caller supplied a query file
    my $query = $queryFile ? $queryFile : "$blast_dir/dbquery.fa.gz";
    if ( !$queryFile ) {
        $splits = $self->mysql->get_candncrna_fasta_without_polyA($query);
    }

    # calculate splits of input data so that every blast query includes circa 200 sequences
    $splits = int($splits/200);

    foreach my $db ( @$db_arrayref ) {
        # the database file name without its directory names the output
        my $output = ( split "\/", $db )[-1];

        # run blastn with standard parameter (e-value<1e-20)
        $self->blast->run_standard_blastn($query, $db, $splits, $output.".blast.gz");

        # create table format of blast output
        my $table = "$blast_dir/$output.blast.table";
        unlink $table if -e $table;
        `kvlblast2table -M -u $blast_dir/$output.blast.gz`;

        # extract significant hits from blast output
        my $identity    = 0.95;
        my $subcoverage = 0.85;
        my $hits = $self->blast->filter_blast_output($table, $db, $identity, $subcoverage);

        my $count;
        if ( $queryFile || $self->only_read_mysql ) {
            # print hits to stdout
            $count = 0;
            print ">$output:\n";
            foreach my $id ( keys %$hits ) {
                print "$id\t$$hits{$id}\n";
                $count++;
            }
        }
        else {
            # insert annotated ESTs in the annotatedncrna-table
            $count = $self->mysql->insert_annotatedncrna($hits);
        }
        $hits_per_db{$output} = $count;
    }

    return \%hits_per_db;
}

# BLAST searches against non-coding RNA databases (e.g. Rfam, NONCODE) that
# annotate the predicted subsequences of the entries in the candncrna-table
# and in the knownncrna-table and store the annotation in these tables.
# Relaxed criteria are applied to the BLASTN hits: e-value < 0.1,
# identity > 85% and subject coverage > 60%.
#
# input  - (1) reference to an array with the ncRNA database files
#          (2) value handed to the formatdb accessor of the BlastInterface
#          (3) value handed to the blast accessor of the BlastInterface
# output - reference to a hash with key = name of the blast output file and
#          value = number of annotated entries; if only_read_mysql is set the
#          hits are printed to stdout instead of being written to the database
sub annotate_subsequences_using_ncRNA_db {
    my ($self, $db_arrayref, $FORMATDB_HOME, $BLAST_HOME) = @_;
    my %res;

    # relaxed hit criteria shared by both passes
    my $evalue      = 1e-1;    # 0.1
    my $identity    = 0.85;
    my $subcoverage = 0.6;

    # create EST2ncRNA::Annotation class instance as attribute of object $pipeline
    $self->annotation($self->server);
    my $SUBDIR = $self->annotation->subdir;

    # create EST2ncRNA::BlastInterface class instance as attribute of object $pipeline
    $self->blast($self->server);
    $self->blast->blast($BLAST_HOME);
    $self->blast->formatdb($FORMATDB_HOME);
    my $BLASTSUBDIR = $self->blast->subdir;

    # The candidate pass and the known-ncRNA pass differ only in the query
    # that creates the fasta file, the suffix of the blast output file and the
    # MysqlInterface method that stores the annotation.
    my @passes = (
        {
            # all candidate ncRNAs from the mySQL-db
            query  => q(select ass_id, cand_start, cand_end, ass_sequence from assest, candncrna where cand_id=ass_id),
            suffix => ".novelncrna.blast",
            store  => "insert_cand_annotation",
        },
        {
            # all known ncRNAs from the mySQL-db without tRNAs
            query  => q(select known_id, known_start, known_end, ass_sequence from assest, knownncrna where known_id=ass_id and known_status="HIT"),
            suffix => ".knownncrna.blast",
            store  => "update_known_ncRNA",
        },
    );

    foreach my $pass ( @passes ) {
        # create the sequence fasta file for this pass
        my $splits = $self->mysql->get_statement_fasta($pass->{query}, 1, "$SUBDIR/dbquery.fa.gz", 1);

        # calculate splits of input data so that every blast query includes circa 200 sequences
        # NOTE(review): this is 0 for fewer than 200 sequences - confirm that
        # run_standard_blastn accepts 0 splits
        $splits = int($splits/200);

        # A named lexical loop variable is used on purpose: iterating with the
        # global $_ aliases it to the elements of the caller's array, so any
        # callee assigning to $_ would overwrite the database list.
        foreach my $db ( @$db_arrayref ) {
            # name the blast output after the last path component of the database
            my $blastfile = ( split m{/}, $db )[-1];
            $blastfile .= $pass->{suffix};

            # run blastn with the relaxed e-value threshold
            $self->blast->run_standard_blastn("$SUBDIR/dbquery.fa.gz", $db, $splits, $blastfile.".gz", $evalue);

            # create table format of blast output (a stale table is removed first)
            my $tablefile = "$BLASTSUBDIR/$blastfile.table";
            unlink $tablefile if -e $tablefile;
            `kvlblast2table -M -u $BLASTSUBDIR/$blastfile.gz`;
            warn "kvlblast2table failed for $BLASTSUBDIR/$blastfile.gz (wait status $?)\n" if $? != 0;

            # extract significant hits from blast output and create pairs of id to annotation
            my $anno_refhash = $self->blast->filter_blast_output($tablefile, $db, $identity, $subcoverage);

            my $nr;
            if ( $self->only_read_mysql ) {
                # print hits to stdout
                $nr = 0;
                print ">$blastfile:\n";
                foreach my $id ( keys %$anno_refhash ) {
                    print "$id\t$$anno_refhash{$id}\n";
                    $nr++;
                }
            }
            else {
                # write the annotation into the table belonging to this pass
                my $store = $pass->{store};
                $nr = $self->mysql->$store($anno_refhash);
            }
            $res{$blastfile} = $nr;
        }
    }

    return \%res;
}


# Annotate microRNAs among the ncRNA candidates with Jana's tool RNAmicro.
#
# input  - (1) value handed to the rnamicrohome accessor of the Annotation object
#          (2) value handed to the rnamicromodeldir accessor of the Annotation object
#          (3) true if the input alignments should be shuffled first
#              (significance test)
# output - without shuffling: the number of hits (return value of
#          insert_microRNA, or the number of lines written to
#          '<subdir>/rnamicro.<id>.out' if only_read_mysql is set);
#          with shuffling: a text reporting the estimated false positive
#          rates for the p-values 0.5 and 0.9
sub annotate_microRNA {
    my ($self, $RNAMICRO_HOME, $RNAMICRO_MODELDIR, $random) = @_;

    # create EST2ncRNA::RNAzInterface class instance as attribute of object $pipeline (RNAmicro needs the align-folder)
    $self->rnaz($self->server);
    my $OLDALNSUBDIR = $self->rnaz->subdir;    # only needed by the disabled prepare_aln_files call below
    my $ALNDIR = "aln";

    # create EST2ncRNA::Annotation class instance as attribute of object $pipeline
    $self->annotation($self->server);
    $self->annotation->rnamicrohome($RNAMICRO_HOME);
    $self->annotation->rnamicromodeldir($RNAMICRO_MODELDIR);

    my $HOME = $self->server->home;
    my $SUBDIR = $self->annotation->subdir;

    my $ID = $self->unique_id;
    my $plusfile = ( $random ) ? "rnamicro.cand.plus.random.$ID.out" : "rnamicro.cand.plus.$ID.out";
    my $minusfile = ( $random ) ? "rnamicro.cand.minus.random.$ID.out" : "rnamicro.cand.minus.$ID.out";

    # get filenames of clustalw alignments of candncrnas
    my $files_ref = $self->mysql->cand_aln_files();

    # folder holding the prepared alignment files
    my $alnfolder = "$HOME/$SUBDIR/$ALNDIR";
#    $self->annotation->prepare_aln_files($OLDALNSUBDIR, $ALNDIR);

    # shuffle the input alignments if a significance test is desired
    $files_ref = $self->rnaz->run_rnazRandomizeAln($alnfolder, $files_ref) if $random;

    # create pairs of id to annotation as microRNA
    my $micro_refhash = $self->annotation->RNAmicro($alnfolder, $files_ref, $plusfile, $minusfile);

    if ( $random ) {
        # get estimated false positive rate (significance test)
        my $rate = $self->mysql->RNAmicro_statistics($micro_refhash, 0.5);
        my $res = "False positive rate of RNAmicro prediction with p-value 0.5 is $rate.\n";
        $rate = $self->mysql->RNAmicro_statistics($micro_refhash, 0.9);
        $res = $res."False positive rate of RNAmicro prediction with p-value 0.9 is $rate.\n";
        return $res;
    }

    if ( !$self->only_read_mysql ) {
        # insert microRNAs in the microrna-table
        return $self->mysql->insert_microRNA($micro_refhash);
    }

    # print hits to file; 'or' (not '||') is required so that a failing open
    # is really detected - '||' would bind to the filename string
    my $outfile = "$SUBDIR/rnamicro.$ID.out";
    open my $out, '>', $outfile or die("Can not open the file '$outfile': $!\n");
    my $nr = 0;
    foreach my $id ( keys %$micro_refhash ) {
        print {$out} "$id\t$$micro_refhash{$id}\n";
        $nr++;
    }
    close $out or die("Can not close the file '$outfile': $!\n");

    # return number of ncRNA candidates
    return $nr;
}


# Annotate snoRNAs among the ncRNA candidates with Jana's tool snoReport.
#
# input  - (1) value handed to the snoreporthome accessor of the Annotation object
#          (2) value handed to the snoreportmodeldir accessor of the Annotation object
#          (3) true if the input alignments should be shuffled first
#              (significance test)
# output - without shuffling: the number of hits (return value of
#          insert_snoRNA, or the number of lines written to
#          '<subdir>/snoreport.<id>.out' if only_read_mysql is set);
#          with shuffling: a text reporting the estimated false positive
#          rates for the p-values 0.5 and 0.9
sub annotate_snoRNA {
    my ($self, $SNOREPORT_HOME, $SNOREPORT_MODELDIR, $random) = @_;

    # create EST2ncRNA::RNAzInterface class instance as attribute of object $pipeline (snoReport needs the align-folder)
    $self->rnaz($self->server);
    my $OLDALNSUBDIR = $self->rnaz->subdir;    # only needed by the disabled prepare_aln_files call below
    my $ALNDIR = "aln";

    # create EST2ncRNA::Annotation class instance as attribute of object $pipeline
    $self->annotation($self->server);
    $self->annotation->snoreporthome($SNOREPORT_HOME);
    $self->annotation->snoreportmodeldir($SNOREPORT_MODELDIR);

    my $HOME = $self->server->home;
    my $SUBDIR = $self->annotation->subdir;

    my $ID = $self->unique_id;
    my $plusfile = ( $random ) ? "snoreport.cand.plus.random.$ID.out" : "snoreport.cand.plus.$ID.out";
    my $minusfile = ( $random ) ? "snoreport.cand.minus.random.$ID.out" : "snoreport.cand.minus.$ID.out";

    # get filenames of clustalw alignments of candncrnas
    my $files_ref = $self->mysql->cand_aln_files();

    # folder holding the prepared alignment files
    my $alnfolder = "$HOME/$SUBDIR/$ALNDIR";
    #$self->annotation->prepare_aln_files($OLDALNSUBDIR, $ALNDIR);

    # shuffle the input alignments if a significance test is desired
    $files_ref = $self->rnaz->run_rnazRandomizeAln($alnfolder, $files_ref) if $random;

    # create pairs of id to annotation as snoRNA
    my $sno_refhash = $self->annotation->snoReport($alnfolder, $files_ref, $plusfile, $minusfile);

    if ( $random ) {
        # get estimated false positive rate (significance test)
        my $rate = $self->mysql->get_snoRNA_statistics($sno_refhash, 0.5);
        my $res = "False positive rate of snoRNA prediction with p-value 0.5 is $rate.\n";
        $rate = $self->mysql->get_snoRNA_statistics($sno_refhash, 0.9);
        $res = $res."False positive rate of snoRNA prediction with p-value 0.9 is $rate.\n";
        return $res;
    }

    if ( !$self->only_read_mysql ) {
        # insert snoRNAs in the snorna-table
        return $self->mysql->insert_snoRNA($sno_refhash);
    }

    # print hits to file; 'or' (not '||') is required so that a failing open
    # is really detected - '||' would bind to the filename string
    my $outfile = "$SUBDIR/snoreport.$ID.out";
    open my $out, '>', $outfile or die("Can not open the file '$outfile': $!\n");
    my $nr = 0;
    foreach my $id ( keys %$sno_refhash ) {
        print {$out} "$id\t$$sno_refhash{$id}\n";
        $nr++;
    }
    close $out or die("Can not close the file '$outfile': $!\n");

    # return number of ncRNA candidates
    return $nr;
}

=head2 B<annotate_UTRs>

Annotate ncRNA candidates as 5'- or 3'-UTRs using the gene annotation of a homologous organism
    Input  - (1) rnazCluster.pl output file (the RNAz output file has to lie in the same location with same file root and the ending "out") or "0" if the RNAz results should be taken from the mysql database
             (2) reference to a hash with key = homologous organism name and value = gene annotation file (entire path)
    Output - reference to an array of result messages (two per organism: 5'-UTRs and 3'-UTRs); the UTR annotation is written to the database, or to the files 'UTR[3|5].out.txt' if a query file is given or C<only_read_mysql> is set

=cut
sub annotate_UTRs {
    my ($self, $queryFile, $knownGene_hashref) = @_;
    my @res;

    # create EST2ncRNA::ServerInterface class instance 'localhost'
    my $server = eval { EST2ncRNA::ServerInterface->new("localhost",".",0,0,0); } or die ($@);

    # create EST2ncRNA::Annotation class instance as attribute of object $pipeline
    $self->annotation($server);

    # the two annotation passes: [ flag handed to annotUTRs, UTR label, output file ]
    my @passes = (
        [ 1, "5'", "UTR5.out.txt" ],
        [ 0, "3'", "UTR3.out.txt" ],
    );

    foreach my $org ( keys %$knownGene_hashref ) {
        # get all candidates of ncRNAs and the homologous sequences of the organism $org
        my $homo_hashref = $queryFile
            ? $self->annotation->cand_homologous_seq($org, $queryFile)
            : $self->mysql->cand_homologous_seq($org);

        # write homologous sequences in a file; 'or' (not '||') is required
        # so that a failing open is really detected
        my $homologfile = "est.rnaz.$org.homologs.txt";
        open my $homo_out, '>', $homologfile or die("Can not open the file '$homologfile': $!\n");
        foreach my $cand ( keys %$homo_hashref ) {
            my @key  = split /:/, $cand;
            my @line = split /:/, $$homo_hashref{$cand};
            if ( defined $line[1] ) {
                print {$homo_out} "$key[0]\t$key[1]\t$key[2]\t$line[0]\t$line[1]\t$line[2]\t$line[3]\n";
            }
            else {
                # value consists of a single field only
                print {$homo_out} "$key[0]\t$key[1]\t$key[2]\t$line[0]\n";
            }
        }
        close $homo_out or die("Can not close the file '$homologfile': $!\n");

        foreach my $pass ( @passes ) {
            my ($five_prime, $label, $outfile) = @$pass;

            # scan the knownGenes file and annotate all ncRNA candidates in the UTR regions of this pass
            my $utr_hashref = $self->annotation->annotUTRs($$knownGene_hashref{$org}, $homo_hashref, $five_prime);

            my $nr;
            if ( $queryFile || $self->only_read_mysql ) {
                # NOTE(review): the file is truncated on every open, so with
                # several organisms only the last one's hits remain - confirm
                # that this is intended
                $nr = 0;
                open my $out, '>', $outfile or die("Can not open the file '$outfile': $!\n");
                foreach my $cand ( keys %$utr_hashref ) {
                    my ($cid, $cstart, $cend) = split /:/, $cand;
                    print {$out} "$cid\t$cstart\t$cend\t$$utr_hashref{$cand}\n";
                    $nr++;
                }
                close $out or die("Can not close the file '$outfile': $!\n");
            }
            else {
                # update annotation in the candncrna-table; the hash is fresh
                # for every pass so that no entries of an earlier pass or
                # organism are inserted again
                my %anno;
                foreach my $cand ( keys %$utr_hashref ) {
                    my @key = split /:/, $cand;
                    $anno{$key[0].":".$key[1]} = "$label ($$utr_hashref{$cand})";
                }
                $nr = $self->mysql->insert_cand_utr(\%anno);
            }
            push @res, "$nr novel ncRNAs are annotated as ${label}-UTRs.";
        }
    }

    return \@res;
}


# Compares the ncRNA candidates with the structure alignments listed in a
# file; a reference organism is used to map the ESTs to the existing alignments.
#
# input  - (1) reference to a hash with key = organism and value = alignment
#              list handed to compExistAlignments
#          (2) coverage handed to compExistAlignments
# output - return value of compExistAlignments for the organism processed last
#          (undef if the hash is empty)
sub annotate_known_alignments {
    my ($self, $alignlist_hashref, $coverage) = @_;

    # the Annotation object works on a local EST2ncRNA::ServerInterface instance
    my $localhost = eval { EST2ncRNA::ServerInterface->new("localhost", ".", 0, 0, 0) } or die ($@);
    $self->annotation($localhost);

    my $count;
    for my $organism ( keys %{$alignlist_hashref} ) {
        # candidates of ncRNAs together with their homologous sequences in $organism
        my $homologs = $self->mysql->cand_homologous_seq($organism);

        # compare them with the known structural alignments of $organism
        $count = $self->annotation->compExistAlignments(
            $homologs,
            $alignlist_hashref->{$organism},
            $coverage
        );
    }

    return $count;
}

1;

__END__

=head1 NAME

C<EST2ncRNA::Pipeline> - EST to ncRNA pipeline interface

=head1 SYNOPSIS

    use EST2ncRNA::Pipeline;

    $pipeline = eval { new EST2ncRNA::Pipeline; } or die ($@);

    $pipeline->mysql($DATA_SOURCE, $USERNAME, $PASSWORD);
    $pipeline->server($SERVER, $WORKDIR, $QUEUE, $NODES, $WALLT);
    $pipeline->server->prepare_dir_tree;

    $res = $pipeline->store_ass_ests($estFile);
    $res = $pipeline->reject_small_ests($length);
    $res = $pipeline->reject_coding_rnas(\@blastxResultsFiles, \@peptidDBs);
    $res = $pipeline->reject_known_ncrnas($RAVENNA_HOME, $RfamModel, $RfamSeed);
    $res = $pipeline->find_conserved_rna_in_closed_organism(\%closedorganismToGenomicDataFile, $advancedBlastn);
    $refreshash = $pipeline->search_homologous_sequences_in_other_mammalians_with_ucsc_over_chain(\%ucscPairwiseAlign, $minMatch);
    $refreshash = $pipeline->search_homologous_sequences_in_other_mammalians_with_ucsc_maf(\%ucscMultipleAlign, $minMatch);
    $res = $pipeline->search_rna_genes;
    $pipeline->annotate_tRNAs();

=head1 DESCRIPTION

The Pipeline module offers all steps needed to extract all candidate non-coding RNAs from an EST dataset. The pipeline works closely together with a MySQL database, which stores all data produced during the pipeline run, and with a server interface, which allows the CPU- and memory-expensive tasks to be outsourced to a remote machine. It is therefore necessary to set up the C<mysql> and the C<server> attribute and to create a temporary working directory structure on the server before the actual pipeline work can start. As further attributes it has a RavennaInterface object, a BlastInterface object, a UCSCInterface object, a RNAzInterface object and a ClustalwInterface object.

=head1 METHODS

=head2 B<mysql>

Creates a new EST2ncRNA::MysqlInterface class instance if parameters are given; otherwise returns the existing EST2ncRNA::MysqlInterface class instance.
    Input  - C<datasource>, C<username> and C<password> of the connected database
    Output - EST2ncRNA::MysqlInterface class instance

=head2  B<server>

Creates a new EST2ncRNA::ServerInterface class instance if parameters are given; otherwise returns the existing EST2ncRNA::ServerInterface class instance. If all processes should run locally, the server can be created with:

           $pipeline->server("localhost", $WORKDIR, 0, 0, 0);

    Input  - C<servername>, C<workdir>, C<queue>, C<nodes> and C<walltime>
    Output - EST2ncRNA::ServerInterface class instance

=head2 B<ravenna>

Creates a new EST2ncRNA::RavennaInterface class instance if parameters are given; otherwise returns the existing EST2ncRNA::RavennaInterface class instance.
    Input  - C<homedirectory of RaveNnA>, C<Rfam models file>, C<Rfam seed file> and C<server instance>
    Output - EST2ncRNA::RavennaInterface class instance

=head2 B<blast>

Creates a new EST2ncRNA::BlastInterface class instance if parameters are given; otherwise returns the existing EST2ncRNA::BlastInterface class instance.
    Input  - C<boolean if advanced blastn> and C<server instance>
    Output - EST2ncRNA::BlastInterface

=head2 B<ucsc>

Creates a new EST2ncRNA::UCSCInterface class instance if parameters are given; otherwise returns the existing EST2ncRNA::UCSCInterface class instance.
    Input  - C<server instance>
    Output - EST2ncRNA::UCSCInterface class instance

=head2 B<rnaz>

Creates a new EST2ncRNA::RNAzInterface class instance if parameters are given; otherwise returns the existing EST2ncRNA::RNAzInterface class instance.
    Input  - C<server instance>
    Output - EST2ncRNA::RNAzInterface class instance

=head2 B<clustalw>

Creates a new EST2ncRNA::ClustalwInterface class instance if parameters are given; otherwise returns the existing EST2ncRNA::ClustalwInterface class instance.
    Input  - C<server instance> and C<subdir>
    Output - EST2ncRNA::ClustalwInterface class instance

=head2 B<store_ass_ests>

Creates or cleans the pipeline database and stores the input assembled EST data in it.
    Input  - the filename of the assembled EST data.
    Output - the number of inserted assembled ESTs.

=head2 B<reject_small_ests>

Rejects small ESTs from the ncRNA candidates (recommended length is 60nts)
    Input  - unusable length.
    Output - the number of rejected ESTs.

=head2 B<reject_coding_rnas>

Rejects protein-coding RNAs from the ncRNA candidates; for this purpose blastx is used against protein databases (e.g. 'nr', 'swissprot')
    Input  - a reference to an array with all available blastx-result files
             a reference to an array with the peptide sequence db files
    Output - the number of protein coding RNAs

=head2 B<find_conserved_rna_in_closed_organism>

Stores the source organism (origin, e.g. pig) specific ESTs in the origspecest-table and deletes them from the candncrna-table. It uses BLASTN to find sequences related to the genome of a closely related organism (e.g. cattle) and stores the blastn output in the blastn-table. If advanced blastn parameters are selected, an adapted Serial BLAST Strategy consisting of 3 parts is used: (1) search 1 with standard parameters, (2) sequence retrieval, (3) search 2 of ESTs against related chromosomes with advanced parameters.
    Input  - a reference to a hash with one entry
                 key: the name of the query organism
                 value: a text file including per line the name of one chromosome FASTA-file of the near related organism
                        (chromosome FASTA-files should start with a global ID for the chromosome ('gnl|organism|chromosome'))
             using of advanced blastn parameter? (0..NO, location of the UniVec_Core file for blastn vector filtering..YES)
    Output - the number of ESTs which are not conserved in the near related organism
    B<REQUIREMENT:> the perl script C<kvlblast2table> in the working directory

=head2 B<search_homologous_sequences_in_other_mammalians_with_ucsc_over_chain>

Searches for sequences in other mammals that are homologous to the conserved ESTs. It uses available pairwise alignments of the UCSC Download Server (ftp://hgdownload.cse.ucsc.edu/goldenPath) and the UCSC tool liftOver.
    Input  - a reference to a hash with several entries
                 key: a text file including per line the name of one chromosome FASTA-file of the subject organism
                 value: the over.chain file
             the minimum ratio of bases that must remap (recommended value is 0.8)
    Output - hash with pairs of 'name of subject mammalian':'number of conserved ESTs'
    B<REQUIREMENT:> the UCSC tool C<liftOver> in the working directory

=head2 B<search_homologous_sequences_in_other_mammalians_with_ucsc_maf>

Searches for sequences in several mammals that are homologous to the conserved ESTs. It uses available multiple alignments of the UCSC Download Server (ftp://hgdownload.cse.ucsc.edu/goldenPath).
    Input  - a reference to a hash with several entries
                 key: the name of the query organism
                 value: a text file including per line the path of one maf-file
             the minimum ratio of bases that must remap (recommended value is 0.6)
    Output - hash with pairs of 'name of subject mammalian':'number of conserved ESTs'
    B<REQUIREMENT:> the perl script C<scan_maf.pl> in the working directory

=head2 B<search_rna_genes>

Executes clustalw to get alignments of the candidate ESTs and executes RNAz to predict novel non-coding RNAs.
    Output - number of novel non-coding RNAs.

=head1 NECESSARY PERL MODULES

=over 6

=item B<EST2ncRNA::MysqlInterface>

Class. Connects automatically to, creates and maintains a MySQL database holding the data of the pipeline.

=item B<EST2ncRNA::ServerInterface>

Class. Runs and maintains processes on the remote system.

=item B<EST2ncRNA::RavennaInterface>

Class. Runs RaveNnA.

=item B<EST2ncRNA::SequenceInterface>

Provides several information extraction tools for genomic data files.

=item B<EST2ncRNA::BlastInterface>

Class. Runs BLAST.

=item B<EST2ncRNA::UCSCInterface>

Class. Extracts information from several UCSC file formats, e.g. I<over.chain>, I<maf>.

=item B<EST2ncRNA::ClustalwInterface>

Class. Runs Clustalw.

=item B<EST2ncRNA::RNAzInterface>

Class. Runs RNAz.

=back

=head1 AUTHOR

Stefan Seemann, E<lt>seemann@bioinf.uni-leipzig.deE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006 by Stefan Seemann

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.6 or,
at your option, any later version of Perl 5 you may have available.

=cut
