#!/usr/bin/perl -w
#
# est2ncrna.pl
# preconditions to execute the script are a mysql access and a created database

use Getopt::Long;
use Pod::Usage;

use EST2ncRNA::Pipeline;



#####################################################
# get options
#####################################################

# Defaults for every command line switch. Each variable below is bound to the
# option of the same name in @option_spec; GetOptions overwrites the default
# only when the option is actually given.

# flags and single-valued options
my ($locally, $only_read_mysql) = ('', 0);   # as default the pipeline writes the results in the db, if -only_read_mysql is set it write to stdout
my ($estFile, $queryFile) = ('', 0);
my ($rejectSmallEST, $rejectORF, $rejectKnownNcRNA) = ('', '', '');
my ($RfamModel, $RfamSeed, $advancedBlastn) = ('', '', '');
my ($alignSeq, $annotRNA, $annoMicroRNA, $annoSnoRNA) = ('', '', '', '');
my $shuffledAln = 0;

# blast hit selection criteria
my $evalue = 1e-20;
my $length = 100;
my $identity = 75;

# options which may be given several times (collected in lists)
my (@blastxResults, @peptidDB, @annoNcRNAdb, @predictNovelNcRNA, @annoSubseqNcRNAdb);

# options which take key=value pairs (collected in hashes)
my (%rejectNotConservedEST, %ucscPairwiseAlign, %ucscMultipleAlign, %annoUTRs, %annoKnownAlign);

# documentation switches
my ($man, $help) = (0, 0);

# option specification => destination, handed to Getopt::Long as one list
my @option_spec = (
    'locally'                 => \$locally,
    'only_read_mysql'         => \$only_read_mysql,
    'estFile=s'               => \$estFile,
    'queryFile=s'             => \$queryFile,
    'rejectSmallEST=i'        => \$rejectSmallEST,
    'rejectORF'               => \$rejectORF,
    'blastxResults=s'         => \@blastxResults,
    'peptidDB=s'              => \@peptidDB,
    'annoNcRNAdb=s'           => \@annoNcRNAdb,
    'rejectKnownNcRNA'        => \$rejectKnownNcRNA,
    'RfamModel=s'             => \$RfamModel,
    'RfamSeed=s'              => \$RfamSeed,
    'rejectNotConservedEST=s' => \%rejectNotConservedEST,
    'advancedBlastn=s'        => \$advancedBlastn,
    'evalue=s'                => \$evalue,
    'length=s'                => \$length,
    'identity=s'              => \$identity,
    'ucscPairwiseAlign=s'     => \%ucscPairwiseAlign,
    'ucscMultipleAlign=s'     => \%ucscMultipleAlign,
    'alignSeq'                => \$alignSeq,
    'predictNovelNcRNA=s'     => \@predictNovelNcRNA,
    'shuffledAln'             => \$shuffledAln,
    'annotRNA'                => \$annotRNA,
    'annoSubseqNcRNAdb=s'     => \@annoSubseqNcRNAdb,
    'annoMicroRNA'            => \$annoMicroRNA,
    'annoSnoRNA'              => \$annoSnoRNA,
    'annoUTRs=s'              => \%annoUTRs,
    'annoKnownAlign=s'        => \%annoKnownAlign,
    'help|?'                  => \$help,
    'man'                     => \$man,
);

# an unparsable command line ends the script with the usage message
GetOptions(@option_spec) or pod2usage(2);

if( $help ) {
    pod2usage(1);
}
if( $man ) {
    pod2usage(-verbose => 2);
}

# -blastxResults and -peptidDB additionally accept comma separated lists:
# glue all given values together and cut them again at every comma
@blastxResults = split(/,/, join(',', @blastxResults));
@peptidDB      = split(/,/, join(',', @peptidDB));

######################################################
# user settings
######################################################

# Read the user settings from the file "est2ncrna.config", which is looked up
# relative to the current working directory. Every line which does not start
# with '#' is split at whitespace: the first field (upper-cased) is the key,
# the second field the value (empty string if missing); further fields are
# ignored. Blank lines are skipped.
my (%config, @line);
my $config_file = "est2ncrna.config";

# "or" (not "||") so that the die belongs to open() and not to the file name;
# a missing configuration file now stops the script instead of silently
# producing an empty configuration
open(my $config_fh, '<', $config_file)
    or die("Can not open the file $config_file: $!\n");
while( <$config_fh> ) {
    next if $_=~/^#/;
    @line = split " ";
    next if $#line == -1;
    $line[1] = "" unless defined $line[1];
    $config{uc $line[0]} = $line[1];
}
close $config_fh;

# NOTE: a key which does not occur in the configuration file leaves the
# corresponding variable below undefined.

# database description
my $DATA_SOURCE = $config{"DATA_SOURCE"};
my $USERNAME = $config{"USERNAME"};
my $PASSWORD = $config{"PASSWORD"};

# server configuration
my $SERVER = $config{"SERVER"};
my $QUEUE = $config{"QUEUE"};
my $NODES = $config{"NODES"};
my $WALLT = $config{"WALLT"};

# it is valid local and on server
my $WORKDIR = $config{"WORKDIR"};

# software/script location on the server
my $PBSUBMIT_HOME = $config{"PBSUBMIT_HOME"};
my $FORMATDB_HOME = $config{"FORMATDB_HOME"};
my $RAVENNA_HOME = $config{"RAVENNA_HOME"};
my $BLAST_HOME = $config{"BLAST_HOME"};
my $CLUSTALW_HOME = $config{"CLUSTALW_HOME"};
my $RNAZ_HOME = $config{"RNAZ_HOME"};
my $RNAZSCRIPTS_HOME = $config{"RNAZSCRIPTS_HOME"};
my $RNAMICRO_HOME = $config{"RNAMICRO_HOME"};
my $RNAMICRO_MODELDIR = $config{"RNAMICRO_MODELDIR"};
my $SNOREPORT_HOME = $config{"SNOREPORT_HOME"};
my $SNOREPORT_MODELDIR = $config{"SNOREPORT_MODELDIR"};


######################################################
# pipeline
######################################################

# Drives the pipeline: builds the EST2ncRNA::Pipeline object, attaches the
# database and server interfaces and then executes every step which was
# requested on the command line, in the fixed order given below. All steps
# run inside one eval; if a step dies, a rollback of the database handle is
# attempted and the script exits with status 1.

my ($res, $refreshash, $mam, $refmaffiles);
my $exit_status = 0;

# create EST2ncRNA::Pipeline class instance
my $pipeline = eval { EST2ncRNA::Pipeline->new }
    or die ($@ || "Can not create the EST2ncRNA::Pipeline object\n");
$pipeline->only_read_mysql($only_read_mysql) if $only_read_mysql;
$pipeline->workdir($WORKDIR) if $WORKDIR;

# create EST2ncRNA::MysqlInterface class instance as attribute of object $pipeline
$pipeline->mysql($DATA_SOURCE, $USERNAME, $PASSWORD);

# create EST2ncRNA::ServerInterface class instance as attribute of object $pipeline
if( $locally ) {
    $pipeline->server("localhost");
    $pipeline->server->locally(1);
}
else {
    $pipeline->server($SERVER);
}
$pipeline->server->pbsubmit($PBSUBMIT_HOME);
# the keys may be missing in the configuration file, hence the defined checks
$pipeline->server->queue($QUEUE) if defined $QUEUE && $QUEUE ne "";
$pipeline->server->nodes($NODES) if defined $NODES && $NODES ne "";
$pipeline->server->wallt($WALLT) if defined $WALLT && $WALLT ne "";

print "This project runs with the identifier \"".$pipeline->uid."\"\n";

eval {

    # -estFile
    if( $estFile ) {
        print "Prepare the db.\n";
        $res = $pipeline->store_ass_ests($estFile);
        print "Store $res assembled ESTs in the db.\n";
    }
    # -rejectSmallEST
    if( $rejectSmallEST ) {
        $res = $pipeline->reject_small_ests($rejectSmallEST);
        print "Sort out $res small ESTs (length shorter as $rejectSmallEST bases).\n";
    }
    # -rejectORF (with -blastxResults and/or -peptidDB)
    if( $rejectORF ) {
        $res = $pipeline->reject_coding_rnas(\@blastxResults, \@peptidDB);
        print "Sort out $res coding RNAs.\n";
    }
    # -annoNcRNAdb
    if( $#annoNcRNAdb > -1 ) {
        $refreshash = $pipeline->annotate_using_ncRNA_db($queryFile, \@annoNcRNAdb, $FORMATDB_HOME, $BLAST_HOME);
        foreach( keys %$refreshash ) {
            print "Found $$refreshash{$_} similar items to the ESTs in the ncRNA database $_.\n";
        }
    }
    # -rejectKnownNcRNA (with -RfamModel and -RfamSeed)
    if( $rejectKnownNcRNA ) {
        $res = $pipeline->reject_known_ncrnas($RAVENNA_HOME, $RfamModel, $RfamSeed);
        print "Sort out $res known ncRNAs.\n";
    }
    # -rejectNotConservedEST (with -advancedBlastn, -evalue, -length, -identity)
    if( keys(%rejectNotConservedEST) > 0 ) {
        $res = $pipeline->find_conserved_rna_in_closed_organism($queryFile, \%rejectNotConservedEST, $advancedBlastn, $FORMATDB_HOME, $BLAST_HOME, $evalue, $length, $identity);
        print "Sort out $res ESTs which are not conserved in the near related organism.\n";
    }
    # -ucscPairwiseAlign
    if( keys(%ucscPairwiseAlign) > 0 ) {
        $refreshash = $pipeline->search_homologous_sequences_in_other_mammalians_with_ucsc_over_chain($queryFile, \%ucscPairwiseAlign, 0.8);
        foreach $mam ( keys %$refreshash ) {
            print "$$refreshash{$mam} conserved ESTs have homologous sequences in $mam.\n";
        }
    }
    # -ucscMultipleAlign
    if( keys(%ucscMultipleAlign) > 0 ) {
        $refreshash = $pipeline->search_homologous_sequences_in_other_mammalians_with_ucsc_maf($queryFile, \%ucscMultipleAlign, 0.6, $PBSUBMIT_HOME);
        foreach $mam ( keys %$refreshash ) {
            print "$$refreshash{$mam} conserved ESTs have homologous sequences in $mam.\n";
        }
    }
    # -alignSeq
    if( $alignSeq ) {
        $refmaffiles = $pipeline->align_homologous_sequences();
        print "Created MAF-files are:\n";
        foreach my $maffile ( @$refmaffiles ) {
            print "$maffile\n";
        }
    }
    # -predictNovelNcRNA (optionally with -shuffledAln)
    if( $#predictNovelNcRNA > -1 ) {
        $res = $pipeline->search_rna_genes(\@predictNovelNcRNA, $RNAZ_HOME, $RNAZSCRIPTS_HOME, $shuffledAln);
        print "We found $res candidates of non coding RNAs.\n" unless $shuffledAln;
        print $res if $shuffledAln;
    }
    # -annotRNA
    if( $annotRNA ) {
        $res = $pipeline->annotate_tRNA();
        foreach( @$res ) {
            print "$_\n";
        }
    }
    # -annoSubseqNcRNAdb
    if( $#annoSubseqNcRNAdb > -1 ) {
        $refreshash = $pipeline->annotate_subsequences_using_ncRNA_db(\@annoSubseqNcRNAdb, $FORMATDB_HOME, $BLAST_HOME);
        foreach( keys %$refreshash ) {
            print "Found $$refreshash{$_} similar items to the predicted novel ncRNAs in the ncRNA database $_.\n";
        }
    }
    # -annoMicroRNA (optionally with -shuffledAln)
    if( $annoMicroRNA ) {
        $res = $pipeline->annotate_microRNA($RNAMICRO_HOME, $RNAMICRO_MODELDIR, $shuffledAln);
        # the message is terminated by a newline like all other summaries
        print "$res novel ncRNAs are annotated as microRNA by RNAmicro.\n" unless $shuffledAln;
        print $res if $shuffledAln;
    }
    # -annoSnoRNA (optionally with -shuffledAln)
    if( $annoSnoRNA ) {
        $res = $pipeline->annotate_snoRNA($SNOREPORT_HOME, $SNOREPORT_MODELDIR, $shuffledAln);
        # the message is terminated by a newline like all other summaries
        print "$res novel ncRNAs are annotated as snoRNA by snowReport.\n" unless $shuffledAln;
        print $res if $shuffledAln;
    }
    # -annoUTRs
    if( keys(%annoUTRs) > 0 ) {
        $res = $pipeline->annotate_UTRs($queryFile, \%annoUTRs);
        foreach( @$res ) {
            print "$_\n";
        }
    }
    # -annoKnownAlign
    if( keys(%annoKnownAlign) > 0 ) {
        $res = $pipeline->annotate_known_alignments(\%annoKnownAlign, 0.5);
        print "$res ncRNA candidates are homologous to existing structure alignments.\n";
    }

};

# copy $@ at once, the rollback eval below would overwrite it
if ( my $error = $@ ) {
    warn "Transaction aborted because $error";
    # now rollback to undo the incomplete changes
    # but do it in an eval{} as it may also fail
    eval { $pipeline->mysql->dbh->rollback };
    # add other application on-error-clean-up code here
    #$pipeline->server->reset_env;

    # report the failure to the calling shell
    $exit_status = 1;
}

# terminate the connection
$pipeline->mysql->dbh->disconnect;

exit $exit_status;



######################################################
# documentation
######################################################

__END__

=head1 NAME

C<est2ncrna.pl> - Gets candidates of novel RNA genes from EST data.

=head1 SYNOPSIS

est2ncrna.pl [OPTIONS] [FILES]

=head2 RECOMMENDED PIPELINE

    est2ncrna.pl -estFile=<EST-FILE>
    est2ncrna.pl -rejectSmallEST=60
    est2ncrna.pl -rejectORF -blastxResults=<BLASTX-FILE> ...
    est2ncrna.pl -annoNcRNAdb <NCRNA-DB-FILE> ...
    est2ncrna.pl -rejectKnownNcRNA -RfamModel=Rfam.tar.gz -RfamSeed=Rfam.seed
    est2ncrna.pl -rejectNotConservedEST "bosTau2; assembly March 2005"=<BOSTAU2-GENOME-FILE-LIST> -advancedBlastn=UniVec_Core -evalue=1e-20 -length=100 -identity=75
    est2ncrna.pl -ucscPairwiseAlign <MM7-GENOME-FILE-LIST>=bosTau2ToMm7.over.chain.gz ...
    est2ncrna.pl -ucscMultipleAlign bosTau2=<HG17-MAF-FILE-LIST>
    est2ncrna.pl -alignSeq
    est2ncrna.pl -predictNovelNcRNA <MAF-FILES> ...
    est2ncrna.pl -predictNovelNcRNA <MAF-FILES> ... -shuffledAln
    est2ncrna.pl -annotRNA
    est2ncrna.pl -annoSubseqNcRNAdb <NCRNA-DB-FILE> ...
    est2ncrna.pl -annoMicroRNA
    est2ncrna.pl -annoMicroRNA -shuffledAln
    est2ncrna.pl -annoSnoRNA
    est2ncrna.pl -annoSnoRNA -shuffledAln
    est2ncrna.pl -annoUTRs hg17=knownGene.txt.gz
    est2ncrna.pl -annoKnownAlign hg17=washCoords.lst
    est2ncrna.pl -annoNCBI <NCBI-DB-FILE> ...
    est2ncrna.pl -annoMRNAbinding hg17=<MRNA-DB-FILE>

=head1 DESCRIPTION

Establishes a pipeline for prediction of novel non-coding RNAs in a given EST dataset. It manages the objects in the EST2ncRNA framework.

=head1 OPTIONS

The following options are recognized by B<est2ncrna.pl>:

=over 6

=item B<-only_read_mysql>

Reads the input data from mysql-db and writes the output to B<stdout> (default: writes output in database).

=item B<-locally>

Runs the processes on the local machine (default: runs processes on a server specified in the configuration file C<est2ncrna.config>). ATTENTION: Several processes need large resources.

=item B<-estFile>=I<fasta-file>

Creates the tables of the pipeline or cleans already existing ones. Stores the assembled EST data of the input fasta-file (gzip files are supported) in the tables C<assest> and C<candncrna>.

=item B<-rejectSmallEST>=I<length>

Rejects ESTs from C<candncrna> if they are shorter than I<length>. The recommended length is B<60 nts>.

=item B<-rejectORF>

Rejects protein coding RNAs from C<candncrna> and stores them in C<codingrna>. At least one of the additional parameters B<-blastxResults> or B<-peptidDB> is necessary.

=item B<-annoNcRNAdb> I<NCRNA-DB-FILE> ...

Annotates the ESTs in C<candncrna> as ncRNAs by BLASTN against ncRNA-dbs (input files). The hits are stored in C<annotatedncrna>.

=item B<-blastxResults>=I<blast.mlevel.gz-file> ...

Predicts all hits in the BLASTX generated input file(s) with an M-level lower than 5 as protein coding RNAs.

=item B<-peptidDB>=I<fasta-file> ...

Applies BLASTX against the input fasta-file(s) to generate a blast.mlevel.gz-file.

=item B<-rejectKnownNcRNA>

Rejects known non-coding RNAs from C<candncrna> by RaveNnA and stores them in C<knownncrna>. The two additional parameters B<-RfamModel> and B<-RfamSeed> are necessary.

=item B<-RfamModel>=I<model-file>

Rfam models file. (Download: ftp://ftp.sanger.ac.uk/pub/databases/Rfam/Rfam.tar.gz)

=item B<-RfamSeed>=I<stockholm-file>

Rfam annotated seed alignments in STOCKHOLM format. (Download: ftp://ftp.sanger.ac.uk/pub/databases/Rfam/Rfam.seed.gz)

=item B<-rejectNotConservedEST> I<closed_organism release_annotation>=I<chromosome_fasta-file_list>

Rejects ESTs that are not conserved in a closely related organism from C<candidate> and stores them in C<origspecest>; conserved ESTs are stored in C<estcoverage>. A key=value pair is necessary as input, whereby the key is the name of the closely related organism (first word) followed by its release annotation (e.g. "bosTau2; assembly March 2005") and the value is a text file which contains, per line, the chromosome (or scaffold) number and the path of one chromosome file of the related organism in FASTA format, separated by gaps (e.g. chr10 /home/user/ich/cattle.chr10.fasta.gz). By default blastn is executed with standard parameters; alternatively, more specific blastn parameters are applied with the additional parameter B<-advancedBlastn>. All blast hits are stored in table C<blastn> with a unique ID, and the best hits of each query are stored in table C<estcoverage> if they conform to the selection criteria. The subject and the blast approach are stored in table C<blastnannotation>.

=item B<-advancedBlastn>=I<UniVec_Core>

Runs BLASTN with the parameters recommended in the O'Reilly BLAST book (p. 137) for annotating genomic DNA with ESTs. As argument the UniVec_Core file for BLASTN vector filtering is needed. (Download: ftp://ftp.ncbi.nih.gov/pub/UniVec/UniVec_Core)

=item B<-evalue>=I<BlastHit Evalue>

Optional. Specifies the maximal expectation value that is stored by the blast search. Default value = 1e-20

=item B<-length>=I<BlastHit length>

Optional. Specifies the minimal length of blast hits that are stored as EST homologs. Default value = 100 nucleotides

=item B<-identity>=I<BlastHit identity>

Optional. Specifies the minimal identity of blast hits that are stored as EST homologs. Default value = 75 percent

=item B<-ucscPairwiseAlign> I<list_of_subject_chromosome_fasta-files>=I<over.chain-file> ...

Searches for sequences homologous to the conserved ESTs in the subject mammalians using available pairwise alignments in the OVER.CHAIN format of the UCSC Download Server. The hits will be stored in C<estcoverage>. As argument at least one key=value pair is necessary whereby the key is a text file including per line the path of one subject chromosome file which is in FASTA format and the value is the over.chain-file. (Download: ftp://hgdownload.cse.ucsc.edu/goldenPath)

=item B<-ucscMultipleAlign> I<closed_organism>=I<list_of_maf-files> ...

Searches for sequences homologous to the conserved ESTs in several subject mammalians using available multiple alignments in the MAF format of the UCSC Download Server. The hits will be stored in C<estcoverage>. As argument at least one key=value pair is necessary whereby the key is the closed organism name and the value is a text file including per line the path of a multiple alignment maf-file per one chromosome. (Download: ftp://hgdownload.cse.ucsc.edu/goldenPath)

=item B<-alignSeq>

Creates multiple alignments with CLUSTALW of the ESTs in C<candncrna> and their homologous sequences in C<estcoverage>. All the alignments are collected in MAF-files.

=item B<-predictNovelNcRNA> I<MAF-FILES> ...

Predicts novel candidates of non-coding RNAs with RNAz using the Input MAF-files. Updates C<candncrna> with the final ncRNA candidates and inserts EST windows with conserved secondary structures in C<rnazwindows>.

=item B<-shuffledAln>

Performs mononucleotide shuffling of input data. It can be used with B<-predictNovelNcRNA>, B<-annoMicroRNA> and B<-annoSnoRNA> to verify their results.

=item B<-annotRNA>

Runs tRNAscan_SE to annotate tRNAs in the ncRNA candidates predicted by RNAz and RaveNnA.

=item B<-annoSubseqNcRNAdb> I<NCRNA-DB-FILE> ...

Annotates the EST loci (subsequences) in C<candncrna> as ncRNAs by BLASTN against ncRNA-dbs (input files) with relaxed criteria. The annotation is stored in C<candncrna>.

=item B<-annoMicroRNA>

Annotates microRNAs by RNAmicro.

=item B<-annoSnoRNA>

Annotates snoRNAs by SNOREPORT.

=item B<-annoUTRs> I<organism_name>=I<known_genes_file>

Annotates the predicted non-coding RNAs as 5'-UTRs and 3'-UTRs. To do so, it uses the alignment to a homologous organism (e.g. human) and looks for UTR annotations. As argument a key=value pair is needed whereby the key is the name of the homologous organism and the value is a text-file annotating all known genes of the homologous organism with coding region and exon description. (Download: ftp://hgdownload.cse.ucsc.edu/goldenPath/hg17/database/knownGene.txt.gz)

=item B<-annoKnownAlign> I<organism_name>=I<alignment_list>

Compares ncRNA candidates with available structure alignments of a related organism. Input is a pair of reference organism name and alignment list.

=item B<-annoNCBI> I<NCBI-DB-FILE> ...

Annotates EST loci (subsequences) in C<candncrna> by NCBI databases and writes the hits in C<ncbiannotation>.

=item B<-annoMRNAbinding> I<organism_name>=I<MRNA-DB-FILE>

Searches for complementary bindings of ESTs to related mRNAs. To this end, EST homologous sequences of an aligned organism are blasted against its mRNA database with relaxed criteria. RNAduplex predicts possible hybridization sites of the hits to the mRNAs.

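=item B<-help>|B<-?>

Prints the synopsis and the option descriptions and exits.

=item B<-man>

Prints the complete manual page and exits.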

=back

=head1 CONFIGURATION FILE

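The file C<est2ncrna.config> should be adapted by the user to specify the mysql-database, the server and the full paths of the software. The script opens the file by its bare name, so it has to lie in the directory from which C<est2ncrna.pl> is started (normally the folder containing C<est2ncrna.pl>). Each line specifies one key-value pair separated by space; lines starting with C<#> are ignored.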

=head1 NECESSARY PERL MODULE

It needs the B<EST2ncRNA> framework consisting of 9 classes and another module.

=over 6

=item B<EST2ncRNA::Pipeline>

Class. Controls steps of the EST to ncRNA pipeline. Has as attribute a MysqlInterface object.

=item B<EST2ncRNA::MysqlInterface>

Class. Connects automatically to, creates and maintains a mysql database including the data of the pipeline.

=item B<EST2ncRNA::ServerInterface>

Class. Runs and maintains processes on the remote system.

=item B<EST2ncRNA::RavennaInterface>

Class. Runs RaveNnA.

=item B<EST2ncRNA::BlastInterface>

Class. Runs BLAST.

=item B<EST2ncRNA::UCSCInterface>

Class. Extracts information from several UCSC file formats, f.e. I<over.chain>, I<maf>.

=item B<EST2ncRNA::ClustalwInterface>

Class. Runs CLUSTALW.

=item B<EST2ncRNA::RNAzInterface>

Class. Runs RNAz.

=item B<EST2ncRNA::SequenceInterface>

Provides several information extraction tools for genomic data files.

=back

=head1 AUTHOR

Stefan Seemann, E<lt>seemann@bioinf.uni-leipzig.deE<gt>

=head1 SEE ALSO

RNAz(1) RNAalifold(1) RNAfold(1) Ravenna(1) blastall(1) tRNAscan-SE(1) RNAmicro(1) RNAduplex(1)

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006 by Stefan Seemann

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.6 or,
at your option, any later version of Perl 5 you may have available.

=cut
