package EST2ncRNA::MysqlInterfaceproteinupdate;

use strict;
use warnings;
use DBI;

use EST2ncRNA::SequenceInterface qw(get_subject_subsequences_of_one_chromosome);
require Exporter;

our @ISA = qw(Exporter);  # inherits from Exporter

# Items to export into callers namespace by default. Note: do not export
# names by default without a very good reason. Use EXPORT_OK instead.
# Do not simply export all your public functions/methods/constants.

# Names a caller may import on request.
# NOTE(review): prepare_db is written as a method (it unpacks $self and calls
# $self->dbh), so a caller importing it as a plain function has to pass the
# object as first argument explicitly.
our @EXPORT_OK = qw(prepare_db
		    );

# Nothing is exported by default.
our @EXPORT = qw();

# Module version.
our $VERSION = '0.01';


# Constructor.
#
# Arguments: the DBI data source string, the database user name and the
# password. No connection is opened here; the handle slot '_dbh' starts out
# undefined.
# Returns the new object, blessed into the invoking class.
sub new {
    my ($class, $datasource, $username, $password) = @_;

    my %fields = (
        _datasource => $datasource,
        _username   => $username,
        _password   => $password,
        _dbh        => undef,
    );

    return bless \%fields, $class;
}


# Combined getter/setter for the DBI data source string.
# A defined argument replaces the stored value; the current value is returned.
sub datasource {
    my $self = shift;

    if ( @_ && defined $_[0] ) {
        $self->{_datasource} = $_[0];
    }

    return $self->{_datasource};
}


# Combined getter/setter for the database user name.
# A defined argument replaces the stored value; the current value is returned.
sub username {
    my $self = shift;

    if ( @_ && defined $_[0] ) {
        $self->{_username} = $_[0];
    }

    return $self->{_username};
}


# Combined getter/setter for the database password.
# A defined argument replaces the stored value; the current value is returned.
sub password {
    my $self = shift;

    if ( @_ && defined $_[0] ) {
        $self->{_password} = $_[0];
    }

    return $self->{_password};
}


# Accessor for the database handle.
#
# Connects on first use and reconnects (via connect_database) whenever the
# stored handle no longer answers ping.
# Returns the handle stored in '_dbh'.
sub dbh {
    my ($self) = @_;

    my $handle = $self->{_dbh};

    # Plain negation of ping: the former "!ping == 1" parsed as "(!ping) == 1"
    # and only gave the intended result by accident of operator precedence.
    if ( !defined $handle || !$handle->ping ) {
        $self->connect_database;
    }

    return $self->{_dbh};
}


# Connects to the database and stores the handle in '_dbh'.
#
# Uses the object's datasource, username and password. Dies with the DBI
# error text if the connection cannot be established. The new handle is set
# to RaiseError=1 and AutoCommit=0, so callers must commit explicitly.
sub connect_database {
    my ($self) = @_;

    # Fetch the data source once: a method call is not interpolated inside a
    # string, so "$self->datasource" would print the object reference instead.
    my $datasource = $self->datasource;

    $self->{_dbh} = DBI->connect($datasource, $self->username, $self->password)
        or die("Can't connect to $datasource: $DBI::errstr");
    $self->{_dbh}->{RaiseError} = 1;   # DBI will automatically die if any DBI method call fails
    $self->{_dbh}->{AutoCommit} = 0;   # enable transaction, if possible

    print "Connection with the database ", $datasource, " is established.\n";
}


# Creates every pipeline table that does not exist yet, then empties all of
# them and commits.
#
# Tables handled (in this order):
#   assest          ...  assembled EST data (pipeline input)
#   alignversion    ...  referenced by candncrna and rnazwindows
#   candncrna       ...  current candidates of ncRNA during the whole pipeline
#   codingrna       ...  protein coding RNAs (including ORFs)
#   annotatedncrna
#   knownncrna      ...  known ncRNAs
#   origspecest     ...  ESTs not conserved in the closely related organism
#   blastn          ...  blastn results
#   conservedest    ...  ESTs conserved in the closely related organism
#   estcoverage     ...  aligned sequences of other mammals to the ESTs
#   rnazwindows     ...  windows with locally conserved secondary structures of the candncrna's
#   microrna        ...  EST locations predicted as microRNAs by RNAmicro
#   snorna
#   ncbiannotation
#
# Takes no arguments besides the object; returns the result of the commit.
sub prepare_db {
    my ($self) = @_;

    # One statement per table; alignversion comes before the tables whose
    # foreign keys reference it.
    my @create_statements = (
        q{create table if not exists assest (ass_id int not null primary key, ass_name varchar(100), ass_sequence text)},
        q{create table if not exists alignversion (align_id int not null, align_organism varchar(20), primary key(align_id, align_organism))},
        q{create table if not exists candncrna (cand_id int not null, cand_start int, cand_end int, cand_pvalue float, cand_locus int, cand_annotation varchar(200), cand_utr varchar(50), cand_version int, primary key(cand_id, cand_start), foreign key(cand_version) references alignversion(align_id) on delete set null)},
        q{create table if not exists codingrna (coding_id int not null primary key, coding_mlevel int, coding_protein varchar(100))},
        q{create table if not exists annotatedncrna (anno_id int not null, anno_start int, anno_end int, anno_subject_id varchar(100), anno_evalue float, anno_identity float, anno_subject_coverage float, anno_description varchar(100), primary key(anno_id, anno_subject_id))},
        q{create table if not exists knownncrna (known_id int not null, known_start int not null, known_end int not null, known_score float, known_family varchar(7), known_basepairs int, known_status varchar(5), known_annotation varchar(200), primary key(known_id, known_start, known_end))},
        q{create table if not exists origspecest (orig_id int not null primary key)},
        q{create table if not exists blastn (blastn_query_id int not null, blastn_subject_id varchar(50) not null, blastn_bit_score float, blastn_score int, blastn_align_length int, blastn_identity int, blastn_evalue float, blastn_query_start int, blastn_query_stop int, blastn_subject_start int, blastn_subject_stop int, blastn_query_gaps int, blastn_subject_gaps int, primary key(blastn_query_id, blastn_subject_id, blastn_query_start, blastn_query_stop, blastn_subject_start, blastn_subject_stop))},
        q{create table if not exists conservedest (cons_id int not null, cons_start int, cons_end int, primary key(cons_id, cons_start))},
        q{create table if not exists estcoverage (estcov_source_id int not null, estcov_source_start int, estcov_subject_name varchar(50), estcov_subject_start int, estcov_subject_end int, estcov_subject_sequence text, estcov_align_type varchar(20), primary key(estcov_source_id, estcov_source_start, estcov_subject_name, estcov_align_type))},
        q{create table if not exists rnazwindows (rnaz_locus int, rnaz_window int not null primary key, rnaz_start int, rnaz_end int, rnaz_strand varchar(1), rnaz_pvalue float, rnaz_zscore float, rnaz_sci float, rnaz_combPerPair float, rnaz_align_identity float, rnaz_sequence text, rnaz_cons_structure text, rnaz_align_organisms int, rnaz_align_organisms_list text, foreign key(rnaz_align_organisms) references alignversion(align_id) on delete set null)},
        q{create table if not exists microrna (micro_id int, micro_start int, micro_end int, micro_strand varchar(1), micro_pvalue float, primary key(micro_id, micro_start, micro_strand))},
        q{create table if not exists snorna (sno_id int not null, sno_strand varchar(1), sno_pvalue float, primary key(sno_id, sno_strand))},
        q{create table if not exists ncbiannotation (ncbi_id int not null, ncbi_start int, ncbi_query_start int, ncbi_query_end int, ncbi_subject_id varchar(100), ncbi_evalue float, ncbi_identity float, ncbi_est_coverage float, ncbi_description varchar(100), primary key(ncbi_id, ncbi_start, ncbi_subject_id))},
    );

    # Tables to empty, in the same order as they were created.
    my @tables = qw(
        assest alignversion candncrna codingrna annotatedncrna knownncrna
        origspecest blastn conservedest estcoverage rnazwindows microrna
        snorna ncbiannotation
    );

    # create new tables if they do not exist yet
    foreach my $create_sql ( @create_statements ) {
        $self->dbh->do($create_sql);
    }

    # delete all entries of all tables
    foreach my $table ( @tables ) {
        $self->dbh->do("delete from $table");
    }

    $self->dbh->commit;
}


# Stores the assembled EST input data in the 'assest' table and registers
# every EST over its full length (start 1, end = sequence length) in the
# 'candncrna' table.
#
# Input is the path of a FASTA file (plain, or gzip-compressed when the name
# ends in '.gz'). Two header formats are supported:
#   '><unique id> <name> ...'  the numeric id is used as ass_id
#   '><name> ...'              an id (running number + 1000000000) is created
# The format is decided from the first header line only.
#
# Returns the number of inserted ESTs (0 for an empty file).
# Dies if the file cannot be opened.
sub insert_assest {
    my $self = shift;
    my $file = shift;
    my ($id, $name, $seq);
    my $nr = 0;

    # Lexical handles, explicit modes and "or die": with "open ... || die" the
    # "||" bound to the file name, so a failed open was never noticed. The
    # list form of the pipe open keeps the file name away from the shell.
    my $fh;
    if ( $file =~ /\.gz$/ ) {
        open($fh, '-|', 'gunzip', '-c', $file)
            or die("Can not open the file $file: $!\n");
    }
    else {
        open($fh, '<', $file)
            or die("Can not open the file $file: $!\n");
    }

    my $sth_assest = $self->dbh->prepare(q{insert into assest values (?,?,?)});
    my $sth_cand   = $self->dbh->prepare(q{insert into candncrna (cand_id, cand_start, cand_end) values (?, 1, ?)});

    # check the format at the first line (0 if without unique index, 1 if with unique index)
    my $line = <$fh>;
    unless ( defined $line ) {
        # empty input: nothing to store (previously a row of NULLs was inserted)
        close $fh;
        return 0;
    }
    ($id, $name) = $line =~ /^>(\S+)\s+(\S+)/;
    my $form = ( defined $id && $id =~ /^\d+$/ ) ? 1 : 0;
    $name = $id unless $form;

    while ( $line = <$fh> ) {
        if ( $line =~ /^>(\S+)\s+(\S+)/ ) {
            my ($first, $second) = ($1, $2);

            # a new header closes the previous record
            if ( defined $seq ) {
                $nr++;
                $id = $nr + 1000000000 unless $form;
                $sth_assest->execute($id, $name, $seq);
                $sth_cand->execute($id, length($seq));
                $seq = "";
            }

            if ( $form ) {
                $id   = $first;
                $name = $second;
            }
            else {
                $name = $first;
            }
        }
        elsif ( $line =~ /^(\D+)\s$/ ) {
            # sequence line: digit-free text followed by one whitespace character
            $seq = ( defined $seq ? $seq : "" ) . $1;
        }
    }

    # last entry
    $nr++;
    $id = $nr + 1000000000 unless $form;
    $sth_assest->execute($id, $name, $seq);
    $sth_cand->execute($id, length($seq));

    close $fh;

    $self->dbh->commit;

    return $nr;
}


# Removes from the 'candncrna' table every EST whose stored sequence is
# shorter than the given length (60 nt is the recommended threshold).
#
# Input is the minimal usable length.
# Returns the number of rows found by the length query (one delete is issued
# per row).
sub delete_small_ests_from_candncrna {
    my ($self, $length) = @_;
    my $deleted = 0;

    # ids of the candidates whose EST sequence is shorter than $length
    my $find_short = $self->dbh->prepare(
        q{select cand_id from candncrna inner join assest where cand_id=ass_id and length(ass_sequence)<?}
    );
    $find_short->execute($length);

    # drop each of them from the candidate table
    my $remove_cand = $self->dbh->prepare(q{delete from candncrna where cand_id=?});
    while ( my ($cand_id) = $find_short->fetchrow_array ) {
        $remove_cand->execute($cand_id);
        $deleted++;
    }

    $self->dbh->commit;

    return $deleted;
}


# Stores the blastx results in the 'codingrna' table together with their
# M-level and treats EST hits with M-level 0 to 4 as protein coding RNAs
# (those are deleted from the 'candncrna' table).
#
# Input is a reference to an array with all available gzip-compressed
# blastx result files. Only lines containing the word M0..M6 are used; after
# splitting on whitespace, field 1 carries the M-level, field 2 the EST id
# and field 3 the protein name. Per EST only the hit with the lowest M-level
# is kept.
#
# Returns the number of stored coding RNAs with M-level below 5.
# Dies if zcat cannot be started for a result file.
sub insert_codingrna {
    my $self = shift;
    my $blastxres_ref = shift;

    # prepare db statements
    my $sth_assest = $self->dbh->prepare(q{select * from assest where ass_id=?});
    my $sth_known  = $self->dbh->prepare(q{select * from codingrna where coding_id=?});
    my $sth_insert = $self->dbh->prepare(q{insert into codingrna values (?,?,?)});
    my $sth_delcand   = $self->dbh->prepare(q{delete from candncrna where cand_id=?});
    my $sth_delcoding = $self->dbh->prepare(q{delete from codingrna where coding_id=?});

    # read BLASTX results and write them in the db
    foreach my $resultfile ( @$blastxres_ref ) {
        # List-form pipe open: the file name never reaches a shell, and the
        # former unquoted shell glob "M[0-6]" can no longer be expanded
        # against files in the working directory.
        open(my $zcat, '-|', 'zcat', $resultfile)
            or die("Can not run zcat on $resultfile: $!\n");

        while ( my $line = <$zcat> ) {
            # same selection as "grep -w M[0-6]": M0..M6 as a whole word
            next unless $line =~ /(?<!\w)M[0-6](?!\w)/;

            my @el = split " ", $line;
            next unless defined $el[1] && defined $el[2];

            # A line without a parsable M-level is skipped. Before, it was
            # stored with a NULL level and, because undef<5 is true, also
            # removed the EST from the candidates.
            my ($level) = $el[1] =~ /M(\d)/;
            next unless defined $level;

            # control if coding RNA is part of input EST data
            $sth_assest->execute($el[2]);
            my @row = $sth_assest->fetchrow_array;
            next unless $row[0];

            # if coding RNA is already stored then delete it if the actual M-level is lower
            $sth_known->execute($el[2]);
            @row = $sth_known->fetchrow_array;
            if ( $row[0] ) {
                next if $row[1] <= $level;
                $sth_delcoding->execute($el[2]);
            }

            # store new coding RNA and delete it from 'candncrna'-table
            $sth_insert->execute($el[2], $level, $el[3]);
            $sth_delcand->execute($el[2]) if $level < 5;
        }

        close($zcat)
            or warn("zcat reported a problem for $resultfile (status $?)\n");
    }

    # get number of protein coding RNAs (return value)
    my $sth_count = $self->dbh->prepare(q{select count(*) from codingrna where coding_mlevel<5});
    $sth_count->execute();
    my @count = $sth_count->fetchrow_array;

    $self->dbh->commit;

    return $count[0];
}


# Creates a gzipped sequence FASTA file from the rows of the input statement.
#
# Input:
#   $std     ... SQL statement returning four columns: id, start, end, sequence
#   $subseq  ... 0: write the sequence column as it is
#                1: replace it by the subsequence start..end of the EST
#                2: cut poly-A/poly-T tails from the sequence
#   $file    ... output file (written through "gzip >")
#   $seqname ... optional; if defined, the FASTA name is "id:start" instead of id
#
# Returns the number of written sequences.
# Dies if the gzip pipe cannot be opened.
# NOTE(review): $file is passed to the shell as part of the gzip command;
# callers must not hand in untrusted file names.
sub get_statement_fasta {
    my $self = shift;
    my $std = shift;
    my $subseq = shift;
    my $file = shift;
    my $seqname = shift;
    my ($id, $start, $stop, $seq);
    my $nr = 0;

    my $sth = $self->dbh->prepare($std);
    $sth->execute();
    $sth->bind_columns(\$id, \$start, \$stop, \$seq);

    # create sequence fasta file
    print "Create sequence fasta file.\n";

    # "or die" instead of "|| die": the latter bound to the command string,
    # so a failed open went unnoticed.
    open(my $out, '|-', "gzip > $file")
        or die("Can not open the file $file: $!\n");

    while ( $sth->fetch ) {
        if ( $subseq == 1 ) {
            # get subsequence
            $seq = $self->get_assest_subsequence($id, $start, $stop);
        }
        elsif ( $subseq == 2 ) {
            # remove poly-A tails
            $seq = _trim_poly_at_tails($seq);
        }

        # create fasta file entry
        my $header = ( defined $seqname ) ? "$id:$start" : $id;
        print {$out} ">$header\n$seq\n";
        $nr++;
    }

    # closing the pipe reports buffered write errors and a failing gzip
    close($out)
        or warn("Writing $file through gzip failed (status $?)\n");

    return $nr;
}


# Private helper: removes a poly-A/poly-T run from each end of a sequence.
#
# At each end the first run of at least 10 As or 10 Ts (either case) is
# located; it is cut off together with everything before it (5'-end) or
# after it (3'-end) if the run makes up more than 80% of the cut-off part.
# Each match is tested directly: checking "defined $1" after an unchecked
# match reused the capture of the previous match and could cut good sequence.
sub _trim_poly_at_tails {
    my ($seq) = @_;
    return $seq unless defined $seq;

    # 5'-end
    if ( $seq =~ /([Aa]{10,}|[Tt]{10,})/ ) {
        my $run     = $1;
        my $run_end = index($seq, $run) + length($run);
        my $rate    = length($run) * 100 / $run_end;
        $seq = substr($seq, $run_end) if $rate > 80;
    }

    # 3'-end: same test on the reversed sequence
    my $revseq = scalar reverse $seq;
    if ( $revseq =~ /([Aa]{10,}|[Tt]{10,})/ ) {
        my $run     = $1;
        my $run_end = index($revseq, $run) + length($run);
        my $rate    = length($run) * 100 / $run_end;
        $seq = substr($seq, 0, length($seq) - $run_end) if $rate > 80;
    }

    return $seq;
}


# Writes a gzipped FASTA file with the full EST sequence of every row of the
# 'candncrna' table.
#
# Input is the output file name.
# Returns the number of written candidate ncRNAs.
sub get_candncrna_fasta {
    my ($self, $file) = @_;

    # candidates joined with their EST sequence
    my $candidate_sql = q{select assest.ass_id, candncrna.cand_start, candncrna.cand_end, assest.ass_sequence from assest inner join candncrna where cand_id=ass_id};

    # mode 0: write the total sequences unchanged
    my $written = $self->get_statement_fasta($candidate_sql, 0, $file);

    return $written;
}


# Writes a gzipped FASTA file with the EST sequence of every row of the
# 'candncrna' table, with the poly-A tails of the ESTs removed.
#
# Input is the output file name.
# Returns the number of written candidate ncRNAs.
sub get_candncrna_fasta_without_polyA {
    my ($self, $file) = @_;

    # candidates joined with their EST sequence
    my $candidate_sql = q{select assest.ass_id, candncrna.cand_start, candncrna.cand_end, assest.ass_sequence from assest inner join candncrna where cand_id=ass_id};

    # mode 2: cut poly-A tails before writing
    my $written = $self->get_statement_fasta($candidate_sql, 2, $file);

    return $written;
}


# For every chromosome file of the closely related organism, creates a
# gzipped FASTA file "$SUBDIR/query<chr>.fa.gz" holding the ESTs whose blast
# hits lie on that chromosome.
#
# Input:
#   $refchr     ... reference to an array of chromosome files (<chr>.fa.gz)
#   $SUBDIR     ... directory holding the blast table and receiving the output
#   $blasttable ... gzip-compressed blast table inside $SUBDIR; field 0 of a
#                   line is the EST id, field 1 the subject name
#
# Returns a reference to an array with the names of all query files (without
# the directory). Dies if the blast table or an output pipe cannot be opened.
# NOTE(review): the chromosome is matched as a substring of the subject name,
# so "chr1" also selects hits on "chr10"; confirm against the subject naming.
sub get_candncrna_fasta_from_blasthits {
    my $self = shift;
    my $refchr = shift;
    my $SUBDIR = shift;
    my $blasttable = shift;
    my @advquery;

    foreach my $chrfile ( @$refchr ) {
        # create file names
        my ($chr) = $chrfile =~ /(\w+)\.fa\.gz$/;
        unless ( defined $chr ) {
            # without a chromosome name every EST would match
            warn("Skipping $chrfile: not a <chromosome>.fa.gz file name\n");
            next;
        }
        my $QUERYFILE = "query".$chr.".fa.gz";
        push @advquery, $QUERYFILE;

        # Get all chromosome related ESTs (use a hash to get a distinct list).
        # The hash is local to this chromosome; a shared hash carried the ESTs
        # of all previous chromosomes into every later query file.
        my %idhash;
        open(my $in, '-|', 'zcat', "$SUBDIR/$blasttable")
            or die("Can not open the file $SUBDIR/$blasttable: $!\n");
        while ( my $hitline = <$in> ) {
            my @line = split " ", $hitline;
            next unless defined $line[1];
            $idhash{$line[0]} = 1 if index($line[1], $chr) >= 0;
        }
        close($in)
            or warn("zcat reported a problem for $SUBDIR/$blasttable (status $?)\n");

        # create advanced blast query file in fasta format
        print "Create sequence fasta file of all standard aligned candidate ESTs on $chr.\n";
        my ($id, $seq);
        my $sth = $self->dbh->prepare(qq{select ass_id, ass_sequence from assest});
        $sth->execute();
        $sth->bind_columns(\$id, \$seq);

        open(my $out, '|-', "gzip > $SUBDIR/$QUERYFILE")
            or die("Can not open the file $SUBDIR/$QUERYFILE: $!\n");
        while ( $sth->fetch ) {
            # create fasta file entry for chromosome related ESTs
            print {$out} ">$id\n$seq\n" if exists $idhash{$id};
        }
        close($out)
            or warn("Writing $SUBDIR/$QUERYFILE through gzip failed (status $?)\n");
    }

    return \@advquery;
}

# Stores covariance model hits in the 'knownncrna' table.
#
# Input is a reference to an array of hits; each hit is an array reference
# with the elements (id, start, end, score, family, basepairs, status).
# Every hit is inserted. A hit with status "HIT" additionally removes its EST
# from the 'candncrna' table, unless the EST already has an entry in
# 'knownncrna'.
#
# Returns the number of ESTs removed from 'candncrna'.
sub insert_knownncrna {
    my ($self, $knownncrna) = @_;
    my $removed = 0;

    # statements on the 'knownncrna' table
    my $count_known = $self->dbh->prepare(q{select count(*) from knownncrna where known_id=?});
    my $store_known = $self->dbh->prepare(q{insert into knownncrna (known_id,known_start,known_end,known_score,known_family,known_basepairs,known_status) values (?,?,?,?,?,?,?)});

    # delete statement on the 'candncrna' table
    my $drop_cand = $self->dbh->prepare(q{delete from candncrna where cand_id=?});

    for my $hit ( @$knownncrna ) {
        my ($est_id, $status) = @{$hit}[0, 6];

        # a 'HIT' removes the EST from the candidates if that has not happened before
        if ( $status eq "HIT" ) {
            $count_known->execute($est_id);
            my @count = $count_known->fetchrow_array;
            1 while $count_known->fetchrow_array;   # drain the statement
            if ( !$count[0] ) {
                $drop_cand->execute($est_id);
                $removed++;
            }
        }

        # every hit goes into the 'knownncrna' table
        $store_known->execute(@{$hit}[0 .. 6]);
    }
    $self->dbh->commit;

    return $removed;
}


# Evaluates the ravenna result files "$subdir/*.cmzasha.csv" and stores every
# hit in the 'knownncrna' table.
#
# Input:
#   $subdir ... directory with the *.cmzasha.csv files and, next to each of
#               them, the file of the same name without ".csv"
#   $refbp  ... reference to an array with the basepair number of each model,
#               indexed in the order in which the csv files are globbed
#   $length ... hits spanning at most this many nucleotides get status SMALL
#
# Each "Params:" line is one hit. Its status is
#   GAPS  ... the modelled EST subsequence contains '-'
#   SMALL ... the hit is not longer than $length
#   BP    ... the number of '(' in the predicted structure differs by 20% or
#             more from the basepair number of the model
#   HIT   ... otherwise; the EST is removed from 'candncrna' unless it already
#             has an entry in 'knownncrna'
# known_basepairs is only set for hits that reached the basepair test; it is
# NULL for GAPS and SMALL (the count of the previous hit used to leak in).
#
# Returns the number of ESTs removed from 'candncrna'.
# Dies if a result file cannot be opened.
sub insert_knownncrna2 {
    my $self = shift;
    my $subdir = shift;
    my $refbp = shift;
    my $length = shift;
    my $nr = 0;

    # prepare the statements of the 'knownncrna'-table
    my $sth_count  = $self->dbh->prepare(q{select count(*) from knownncrna where known_id=?});
    my $sth_insert = $self->dbh->prepare(q{insert into knownncrna (known_id,known_start,known_end,known_score,known_family,known_basepairs,known_status) values (?,?,?,?,?,?,?)});

    # prepare the delete-statement of the 'candncrna'-table
    my $sth_delete = $self->dbh->prepare(q{delete from candncrna where cand_id=?});

    # analyse the results and update the mySQL-db
    my $index = 0;
    foreach my $csvfile ( glob("$subdir/*.cmzasha.csv") ) {
        my ($family) = $csvfile =~ /(RF\d{5})/;

        # the predicted secondary structures live in the file without ".csv"
        ( my $structfile = $csvfile ) =~ s/\.csv$//;

        # "or die" instead of "|| die", which bound to the file name
        open(my $in, '<', $csvfile)
            or die("Can not open the file $csvfile: $!\n");
        while ( my $row = <$in> ) {
            next unless $row =~ /^Params:/;

            my $status = "HIT";
            my @par = split ",", $row;
            my $bp;

            # test the plausibility of the ravenna hits

            # check if the modeled subsequence has gaps inside (bug of the assembly software of Dr. Mike Gilchrist)
            # ravenna hits including gap regions causing modelling mistakes hence they should be ignored
            my $subseq = $self->get_assest_subsequence($par[6], $par[9], $par[10]);
            $status = "GAPS" if $subseq =~ /-/;

            # check if the modeled subsequence is longer than $length nucleotides
            $status = "SMALL" if ( $status eq "HIT" && abs($par[10]-$par[9]) <= $length );

            # check if the number of basepairs lies in a range of 20% around the basepair number of the model
            if ( $status eq "HIT" ) {

                # catch the dot-bracket notation of the predicted secondary structure from the .cmzasha-file
                open(my $struct, '<', $structfile)
                    or die("Can not open the file $structfile: $!\n");
                my $sshit = 0;
                while ( my $line = <$struct> ) {
                    my ($id) = $line =~ /^----sequence:\s\#\d+,(\d+)/;
                    $sshit = 1 if defined $id && $id == $par[6];
                    if ( $sshit && $line =~ /^----ssRNAplot:/ ) {
                        ( $subseq ) = $line =~ /^----ssRNAplot:\s+([\(\.\)]+)/;
                        last;
                    }
                }
                close $struct;

                # an ssRNAplot line without structure counts as zero basepairs
                $subseq = "" unless defined $subseq;
                $bp = ( $subseq =~ tr/(// );
                my $modbpr = $$refbp[$index];
                $status = "BP" if ( $bp <= $modbpr-0.2*$modbpr || $bp >= $modbpr+0.2*$modbpr );
            }

            # removes a known ncrna from the 'candncrna'-table if it wasn't already removed and its status is a 'HIT'
            if ( $status eq "HIT" ) {
                $sth_count->execute($par[6]);
                my @count = $sth_count->fetchrow_array;
                while ( $sth_count->fetchrow_array ) {}
                unless ( $count[0] ) {
                    $sth_delete->execute($par[6]);
                    $nr++;
                }
            }

            # add every hit to the covariance models in the 'knownncrna'-table
            $sth_insert->execute($par[6], $par[9], $par[10], $par[11], $family, $bp, $status);
        }
        close $in;
        $index++;
    }
    $self->dbh->commit;

    # delete temp files
    unlink "$subdir/db.fa.gz", "$subdir/ravenna.config.tab", "$subdir/default.heurcreationspec", "$subdir/pbs_ravenna.sh";

    return $nr;
}


# Returns a subsequence of an EST stored in the 'assest' table.
#
# Input is the EST id and the 1-based, inclusive start and end index; the
# two indices may be given in either order. Positions outside the sequence
# are clipped to its ends.
# Returns the subsequence, or the empty string if the EST is unknown or the
# requested range lies completely outside the sequence.
sub get_assest_subsequence {
    my ($self, $id, $start, $end) = @_;

    my $sth = $self->dbh->prepare(q{select ass_sequence from assest where ass_id=?});
    $sth->execute($id);
    my ($seq) = $sth->fetchrow_array;

    # unknown EST or empty sequence
    return "" unless defined $seq && length $seq;

    ($start, $end) = ($end, $start) if $start > $end;

    # clip the range to the sequence, so substr never starts outside the string
    $start = 1            if $start < 1;
    $end   = length($seq) if $end > length($seq);
    return "" if $start > $end;

    return substr($seq, $start - 1, $end - $start + 1);
}


# Fills the estcov_subject_sequence column of the 'estcoverage' table.
#
# Input:
#   $subject     ... subject name; rows are addressed as "gnl|<subject>|<chromosome>"
#   $chr_hashref ... reference to a hash with the subject chromosome name as
#                    key and the genome FASTA-file as value
#   $aligntype   ... alignment type of the rows to update
#
# For each chromosome the matching rows are read ordered by subject start and
# handed, together with the FASTA file, to
# get_subject_subsequences_of_one_chromosome; the keys of the returned hash
# start with "<source id>|<source start>" and address the row to update.
# The changes are committed at the end.
sub update_estcov_subject_sequence {
    my ($self, $subject, $chr_hashref, $aligntype) = @_;

    print "Get aligned subsequences of $subject.\n";

    my $select_rows = $self->dbh->prepare(
        q{select estcov_source_id, estcov_source_start, estcov_subject_start, estcov_subject_end from estcoverage where estcov_subject_name=? and  estcov_align_type=? order by estcov_subject_start}
    );
    my $store_sequence = $self->dbh->prepare(
        q{update estcoverage set estcov_subject_sequence=? where estcov_source_id=? and estcov_source_start=? and estcov_subject_name=? and estcov_align_type=?}
    );

    for my $chrom ( keys %$chr_hashref ) {
        my $subject_name = "gnl|$subject|$chrom";

        $select_rows->execute($subject_name, $aligntype);
        my $rows = $select_rows->fetchall_arrayref;
        my $sequence_of = get_subject_subsequences_of_one_chromosome($chr_hashref->{$chrom}, $rows);

        for my $key ( keys %$sequence_of ) {
            # extract key data from hash
            my ($qid, $qstart) = $key =~ /^(\d+)\|(\d+)/;
            # add subsequence to estcoverage table
            $store_sequence->execute($sequence_of->{$key}, $qid, $qstart, $subject_name, $aligntype);
        }
    }

    $self->dbh->commit;
}


# Selects, for every protein coding EST, the best non-overlapping blastn hits
# and stores them in the 'conservedest' and 'estcoverage' tables; afterwards
# the subject sequences of these alignments are added to 'estcoverage'.
#
# Input:
#   $subject        ... subject name, passed on to update_estcov_subject_sequence
#   $chr_hashref    ... despite its name: path of a text file listing one
#                       chromosome FASTA file (chr*.fa.gz) per line
#   $ADVANCEDBLASTN ... unused in this variant; the alignment type is always
#                       "advanced_blastn"
#   $SUBDIR         ... unused in this variant
#
# ESTs are taken from 'codingrna' (M-level below 5). Their hits with an
# alignment length above 100 and an identity above 75 are read from the
# 'blastn' table ordered by e-value.
# This variant neither fills the 'blastn' table nor touches the 'candncrna'
# and 'origspecest' tables, so the returned counter is always 0.
# Dies if the chromosome list cannot be opened.
sub insert_blastn {
    my ($self, $subject, $chr_hashref, $ADVANCEDBLASTN, $SUBDIR) = @_;
    my $nr = 0;

    my $getcand = q{select coding_id from codingrna where coding_mlevel<5};
    my $getblasthits = q{select blastn_query_id, blastn_query_start, blastn_query_stop, blastn_subject_id, blastn_subject_start, blastn_subject_stop, blastn_evalue from blastn where blastn_query_id=? and blastn_align_length>100 and blastn_identity>75 order by blastn_evalue};
    my $newconservedest = q{insert into conservedest values (?,?,?)};
    my $aligntype = "advanced_blastn";
    my $newestcoverage = q{insert into estcoverage values (?,?,?,?,?, NULL,?)};

    my $sth_cand = $self->dbh->prepare($getcand);
    $sth_cand->execute();
    my $sth_hits      = $self->dbh->prepare($getblasthits);
    my $sth_coverage  = $self->dbh->prepare($newestcoverage);
    my $sth_conserved = $self->dbh->prepare($newconservedest);

    while ( my ($assid) = $sth_cand->fetchrow_array ) {
        my %range;   # accepted query ranges of this EST: start => stop

        $sth_hits->execute($assid);
        while ( my ($qid, $qstart, $qstop, $sid, $sstart, $sstop) = $sth_hits->fetchrow_array ) {
            # does one end of the hit fall into an already accepted range?
            my $overlaps = 0;
            foreach my $start ( keys %range ) {
                if (   ( $qstart >= $start && $qstart <= $range{$start} )
                    || ( $qstop  >= $start && $qstop  <= $range{$start} ) ) {
                    $overlaps = 1;
                    last;
                }
            }

            # NOTE(review): the first overlapping hit ends the processing of
            # this EST; all hits with a worse e-value are dropped, not only
            # the overlapping one. Kept as it was - confirm that is intended.
            last if $overlaps;
            $range{$qstart} = $qstop;

            # add conserved EST subsequence to the 'conservedest'-table
            $sth_conserved->execute($qid, $qstart, $qstop);
            # add the alignment in the 'estcoverage'-table
            $sth_coverage->execute($qid, $qstart, $sid, $sstart, $sstop, $aligntype);
        }
    }

    $self->dbh->commit;

    # Catch the names of the chromosome files in a hash (chromosome => file).
    # "or die" instead of "|| die", which bound to the file name; the handle
    # is lexical and closed after use.
    open(my $list, '<', $chr_hashref)
        or die("Can not open the file $chr_hashref: $!\n");
    my %chr;
    while ( my $line = <$list> ) {
        chomp $line;
        my ($chrom) = $line =~ /(chr\S+)\.fa\.gz$/;
        next unless defined $chrom;   # skip lines that name no chr*.fa.gz file
        $chr{$chrom} = $line;
    }
    close $list;

    # add subject sequences to estcoverage table
    $self->update_estcov_subject_sequence($subject, \%chr, $aligntype);

    return $nr;
}


# Creates the BED file "$SUBDIR/loIn.bed" with the subject coordinates of all
# 'estcoverage' rows whose subject name contains the query organism.
#
# Input are the name of the query organism and the directory where the
# output file is saved.
# Each output line is
#   chromosome \t start \t end \t sourceid|sourcestart \t 1 \t strand
# with a 0-based start; the strand is "-" if the stored subject start lies
# behind the subject end.
#
# Returns the output file location. Dies if the file cannot be written.
sub get_estcoverage_bed {
    my ($self, $queryorg, $SUBDIR) = @_;

    my $OLDFILE = "$SUBDIR/loIn.bed";

    # Select all estcoverage data of the query organism from the mySQL-db.
    # The organism is bound as a parameter instead of being pasted into the
    # SQL text, so quotes in the name cannot break the statement.
    my $getestcov = q{select estcov_source_id,estcov_source_start,estcov_subject_name,estcov_subject_start,estcov_subject_end from estcoverage where estcov_subject_name like ?};
    my $sth = $self->dbh->prepare($getestcov);
    $sth->execute("%$queryorg%");

    # create liftOver input data ("or die": "|| die" bound to the file name)
    open(my $out, '>', $OLDFILE)
        or die("Can not open the file $OLDFILE: $!\n");

    while ( my ($qid, $qstart, $sid, $sstart, $sstop) = $sth->fetchrow_array ) {
        # chromosome name: the part behind the last '|'
        $sid =~ s/.*\|(\w+)$/$1/;

        # test type of strand
        my $strand = "+";
        if ( $sstart > $sstop ) {
            $strand = "-";
            ($sstart, $sstop) = ($sstop, $sstart);
        }

        # index conversion from blastn-convention to over.chain-convention (UCSC) on strand "+"
        my $start = $sstart - 1;
        my $end   = $sstop;

        print {$out} "$sid\t$start\t$end\t$qid|$qstart\t1\t$strand\n";
    }

    # buffered write errors surface at close
    close($out)
        or die("Can not write the file $OLDFILE: $!\n");

    return $OLDFILE;
}


# Creates the file "estcand.txt" listing the estcoverage entries whose subject name
# contains $queryorg, grouped by subject name and ordered by subject start position.
#
# Input  - $queryorg: organism name, matched as a substring of estcov_subject_name
#          $SUBDIR:   directory in which the file is written
# Output - the bare file name "estcand.txt" (without $SUBDIR)
# Dies if the file can not be opened or closed.
sub get_estcoverage_for_maf {
    my ($self, $queryorg, $SUBDIR) = @_;
    my $qchr;

    my $OLDFILE = "estcand.txt";
    my $path    = "$SUBDIR/$OLDFILE";

    # all subject names of the query organism; the name is bound, not pasted into the SQL text
    my $getestcovchr = q{select distinct estcov_subject_name from estcoverage where estcov_subject_name like ?};
    my $sth2 = $self->dbh->prepare($getestcovchr);
    $sth2->execute("%".$queryorg."%");
    $sth2->bind_columns(\$qchr);

    # all entries of one subject name ordered by their start position
    my $getestcov = q{select estcov_source_id,estcov_source_start,estcov_subject_name,estcov_subject_start,estcov_subject_end from estcoverage where estcov_subject_name=? order by estcov_subject_start};
    my $sth1 = $self->dbh->prepare($getestcov);

    # 'or' instead of '||', otherwise a failed open goes unnoticed
    open(my $out, '>', $path) or die("Can not open the file $path: $!\n");

    while( $sth2->fetch ) {
        $sth1->execute($qchr);
        my $queries_ary_ref = $sth1->fetchall_arrayref;

        # chromosome name: the part behind the last "|"; fall back to the full name
        # instead of reusing a capture left over from an earlier match
        my ($chrom) = $qchr =~ /.*\|(\w+)$/;
        $chrom = $qchr unless defined $chrom;

        # a "#chromosome" header line followed by one line per entry
        print {$out} "#$chrom\n";
        foreach my $query_ary_ref ( @$queries_ary_ref ) {
            print {$out} join(" ", @{$query_ary_ref}[0..4]), "\n";
        }
    }

    close($out) or die("Can not close the file $path: $!\n");

    return $OLDFILE;
}


# Inserts the alignments listed in a liftOver output file into the 'estcoverage' table
# and commits the transaction.
#
# Input  - $refcand:     path of the liftOver output file (despite its name not a reference);
#                        every line holds the whitespace separated columns
#                        chromosome, start, end, "est_id|est_start", an ignored fifth column and the strand
#          $subject:     subject name; if defined the subject id is stored as "gnl|$subject|chromosome"
#          $chr_hashref: path of a file (despite its name not a reference) that lists one
#                        chromosome fasta file ("...chrN.fa.gz") per line
# Dies if one of the two files is missing or can not be opened.
sub insert_estcoverage {
    my ($self, $refcand, $subject, $chr_hashref) = @_;
    my $aligntype = "advanced_blastn";

    # three-argument open with an undefined name would silently open a temporary file
    die("Can not open the file!\n") unless defined $refcand && length $refcand;
    die("Can not open the file!\n") unless defined $chr_hashref && length $chr_hashref;

    # read the output file of liftOver; the hash key collapses repeated (EST, chromosome) pairs
    my %cand;
    open(my $lift, '<', $refcand) or die("Can not open the file $refcand: $!\n");
    while( my $line = <$lift> ) {
        my @l = split " ", $line;
        next if @l < 6;    # blank or truncated line

        my ($sid, $start, $end, $strand) = @l[0, 1, 2, 5];
        my ($qid, $qstart) = split /\|/, $l[3];

        # index conversion from over.chain-convention (UCSC) to blastn-convention (equal to estcoverage-table convention)
        my $sstart = $start + 1;
        my $sstop  = $end;

        # swap coordinates if strand "-"
        ($sstart, $sstop) = ($sstop, $sstart) if $strand eq "-";

        $cand{"$qid:$qstart:$sid:$aligntype"} = [$qid, $qstart, $sid, $sstart, $sstop];
    }
    close($lift);

    # add the alignments to the 'estcoverage'-table
    my $sth1 = $self->dbh->prepare(q{insert into estcoverage values (?,?,?,?,?,?,?)});
    foreach my $entry ( values %cand ) {
        my ($qid, $qstart, $sid, $sstart, $sstop) = @$entry;
        $sid = "gnl|$subject|$sid" if defined $subject;

        # the liftOver file carries no sequence; the string "NULL" is stored in the sequence column
        $sth1->execute($qid, $qstart, $sid, $sstart, $sstop, "NULL", $aligntype);
    }

    # catch the names of chromosome files in a hash (chromosome name => listed line)
    my %chr;
    open(my $list, '<', $chr_hashref) or die("Can not open the file $chr_hashref: $!\n");
    while( my $line = <$list> ) {
        chomp $line;
        my ($chrom) = $line =~ /(chr\S+)\.fa\.gz$/;
        next unless defined $chrom;    # not a chromosome fasta file
        $chr{$chrom} = $line;
    }
    close($list);

    # add subject sequences to estcoverage table
    # (presumably fills the sequence column from the chromosome files - confirm in update_estcov_subject_sequence)
    $self->update_estcov_subject_sequence($subject, \%chr, $aligntype);

    $self->dbh->commit;
}


# Creates one fasta file per alignment of each candidate EST, holding the EST subsequence
# and its homologous sequences from the 'estcoverage'-table.
# Alignments whose type contains "blastn" get no file of their own; the first blastn hit
# of the EST is added to every file instead.
#
# Input  - $subdirfasta: the output folder
# Output - a reference to an array including the names of all fasta-files (without file-ending),
#          each of the form "id.start.aligntype"
# Dies if a fasta file can not be opened or closed.
sub get_homologous_fasta {
    my ($self, $subdirfasta) = @_;
    my ($cid, $cstart, $cend, $aligntype, @fasta);

    # candidate ESTs of the 'candncrna'-table with their alignment types in the 'estcoverage'-table
    my $getcandncrna = q{select estcov_source_id, estcov_source_start, cand_end, estcov_align_type from estcoverage, candncrna where estcov_source_id=cand_id and estcov_source_start=cand_start group by estcov_source_id, estcov_source_start, estcov_align_type};
    my $getclosedorgan = q{select estcov_subject_name, estcov_subject_sequence from estcoverage where estcov_source_id=? and estcov_source_start=? and estcov_align_type like "%blastn%"};
    my $getestcoverage = q{select estcov_subject_name, estcov_subject_sequence from estcoverage where estcov_source_id=? and estcov_source_start=? and estcov_align_type=?};
    my $sth1 = $self->dbh->prepare($getcandncrna);
    $sth1->execute();
    $sth1->bind_columns(\$cid, \$cstart, \$cend, \$aligntype);
    my $sth2 = $self->dbh->prepare($getclosedorgan);
    my $sth3 = $self->dbh->prepare($getestcoverage);

    print "Create fasta file for each alignment of each candidate EST with the related homologous sequences.\n";

    # loop for each alignment of each candidate EST
    while( $sth1->fetch ) {
        next if index($aligntype, "blastn")>=0;

        # get candidate EST subsequence
        my $csubseq = $self->get_assest_subsequence($cid, $cstart, $cend);

        push @fasta, "$cid.$cstart.$aligntype";

        # 'or' instead of '||', otherwise a failed open goes unnoticed
        my $file = "$subdirfasta/$cid.$cstart.$aligntype.fa";
        open(my $out, '>', $file) or die("Can not open the file $file: $!\n");
        print {$out} ">$cid\n";
        print {$out} "$csubseq\n";

        # add closed organism: only the first blastn row is used, the rest is discarded
        $sth2->execute($cid, $cstart);
        my ($cname, $cseq) = $sth2->fetchrow_array;
        $sth2->finish;
        if( defined $cname ) {
            # sequence name is "field2.field3" of the "|"-separated subject name
            my @name = split /\|/, $cname;
            print {$out} ">".$name[1].".".$name[2]."\n";
            print {$out} "$cseq\n";
        }

        # add all other covered mammalian subsequences
        $sth3->execute($cid, $cstart, $aligntype);
        while( my ($sname, $sseq) = $sth3->fetchrow_array ) {
            my @name = split /\|/, $sname;
            print {$out} ">".$name[1].".".$name[2]."\n";
            print {$out} "$sseq\n";
        }

        close($out) or die("Can not close the file $file: $!\n");
    }

    return \@fasta;
}


# Converts the coordinates of an EST subsequence into a gap-free start, length and strand.
# The first sequence index is 1.
#
# Input  - $id, $start, $end: handed to get_assest_subsequence to fetch the subsequence
#          (presumably the aligned sequence with "-" as gap character - confirm there);
#          $start > $end denotes the "-" strand
# Output - a reference to an array [start, size, strand]: the start is moved behind
#          gap characters found at the start position, the size is reduced by one
#          for every gap character counted inside the range
sub get_gapfree_indizes {
    my ($self, $id, $start, $end) = @_;

    # the subsequence is fetched with the coordinates as given, before any swap
    my $sequence = $self->get_assest_subsequence($id, $start, $end);
    my @chars = split //, $sequence;

    # bring the coordinates in ascending order and remember the strand
    my $strand = "+";
    if( $start > $end ) {
        ($start, $end) = ($end, $start);
        $strand = "-";
    }

    my $pos    = $start - 1;
    my $length = $end - $start + 1;
    for my $char ( @chars ) {
        $pos++;
        next unless $char eq "-";

        if( $pos == $start ) {
            # gap right at the current start: shift the start and shrink the size
            $start++;
            $length--;
        }
        elsif( $pos > $start && $pos <= $end ) {
            # gap inside the range
            $length--;
        }
    }

    return [$start, $length, $strand];
}


# Input  - the sequence id,
#          the start position (first index is 1),
#          the end position
# Output - the new end position: the given end is increased by one for every "-"
#          found in the subsequence at a position not behind the current end
sub get_gapincluded_indizes {
    my ($self, $id, $start, $end) = @_;

    my $sequence = $self->get_assest_subsequence($id, $start, $end);

    my $pos = $start - 1;
    for my $char ( split //, $sequence ) {
        $pos++;
        # every gap that is still inside the range pushes the end one position further
        $end++ if $end >= $pos && $char eq "-";
    }

    return $end;
}


# Builds for each EST in 'candncrna' a hash including its homologous sequences in 'estcoverage'.
# Alignments whose type contains "blastn" get no entry of their own; the first blastn hit
# of the EST is added to every other alignment instead.
#
# Output - a reference to a hash including
#             as key "$cid.$cstart.$aligntype" of a ncRNA candidate and
#             as value a reference to a hash about the aligned sequences including
#                 as key the identifier and
#                 as value an anonymous array with start position (first sequence index = 1),
#                 length, strand and size of the entire source sequence (always 0);
#                 for the candidate itself start and length come from get_gapfree_indizes
# Prints the number of collected alignments ("nr: N") to STDOUT.
sub get_homologous {
    my ($self) = @_;
    my ($cid, $cstart, $cend, $aligntype, %homologous);

    # (use blastn indizes: starting with 1 and indizes every time from the positive strand)
    my $getcandncrna = q{select estcov_source_id, estcov_source_start, cand_end, estcov_align_type from estcoverage, candncrna where estcov_source_id=cand_id and estcov_source_start=cand_start group by estcov_source_id, estcov_source_start, estcov_align_type};
    my $getclosedorgan = q{select estcov_subject_name, estcov_subject_start, estcov_subject_end from estcoverage where estcov_source_id=? and estcov_source_start=? and estcov_align_type like "%blastn%"};
    my $getestcoverage = q{select estcov_subject_name, estcov_subject_start, estcov_subject_end from estcoverage where estcov_source_id=? and estcov_source_start=? and estcov_align_type=?};
    my $sth1 = $self->dbh->prepare($getcandncrna);
    $sth1->execute();
    $sth1->bind_columns(\$cid, \$cstart, \$cend, \$aligntype);
    my $sth2 = $self->dbh->prepare($getclosedorgan);
    my $sth3 = $self->dbh->prepare($getestcoverage);

    # turns a coordinate pair into [start, length, strand, 0];
    # a pair with equal coordinates yields nothing and is therefore not stored
    my $interval = sub {
        my ($from, $to) = @_;
        return [$from, $to-$from+1, "+", 0] if $from < $to;
        return [$to, $from-$to+1, "-", 0] if $from > $to;
        return;
    };

    # loop for each alignment of each candidate EST
    while( $sth1->fetch ) {
        next if index($aligntype, "blastn")>=0;
        my %aln;

        # candidate ncRNA
        my $gapfree = $self->get_gapfree_indizes($cid, $cstart, $cend);
        $aln{$cid} = [$$gapfree[0], $$gapfree[1], $$gapfree[2], 0];

        # closed organism: only the first blastn row is used, the rest is discarded
        $sth2->execute($cid, $cstart);
        my ($cname, $from, $to) = $sth2->fetchrow_array;
        $sth2->finish;
        if( defined $cname ) {
            my $entry = $interval->($from, $to);
            $aln{$cname} = $entry if $entry;
        }

        # homologous organisms
        $sth3->execute($cid, $cstart, $aligntype);
        while( my ($sname, $sstart, $send) = $sth3->fetchrow_array ) {
            my $entry = $interval->($sstart, $send);
            $aln{$sname} = $entry if $entry;
        }

        # add the key=value pair "$cid.$cstart.$aligntype"=%aln to the returned hash
        $homologous{"$cid.$cstart.$aligntype"} = \%aln;
    }

    my $n = keys %homologous;
    print "nr: $n\n";

    return \%homologous;
}


# Analyses the RNAz results and updates the mySQL-db;
# stores for each pair of EST-id and EST-startindex the locus with the highest p-value.
# All rows of 'candncrna' are deleted first; the transaction is committed at the end.
#
# Input  - $rnazoutfile:      the relative path to the RNAz output file
#          $clusteredoutfile: the relative path to the rnazCluster.pl output file
#          $version:          the version of used organisms for RNAz
# Dies if one of the two files is missing or can not be opened.
sub insert_rnazwindows {
    my ($self, $rnazoutfile, $clusteredoutfile, $version) = @_;
    my ($id, $start, $seq, $strand, $org, @aliorg, %sstruct, %versali, %alivers);

    # three-argument open with an undefined name would silently open a temporary file
    foreach my $file ( $rnazoutfile, $clusteredoutfile ) {
        die("Can not open the file!\n") unless defined $file && length $file;
    }

    # prepare mySQL statements
    my $clear_cand   = $self->dbh->prepare(q{delete from candncrna});
    my $add_cand     = $self->dbh->prepare(q{insert into candncrna values (?,?,?,?,?,NULL,NULL,?)});
    my $add_window   = $self->dbh->prepare(q{insert into rnazwindows values (?,?,?,?,?,?,?,?,?,?,?,?,?,?)});
    my $get_pvalue   = $self->dbh->prepare(q{select cand_pvalue from candncrna where cand_id=? and cand_start=?});
    my $del_cand     = $self->dbh->prepare(q{delete from candncrna where cand_id=? and cand_start=?});
    my $del_windows  = $self->dbh->prepare(q{delete from rnazwindows where rnaz_locus=?});
    my $get_versions = $self->dbh->prepare(q{select * from alignversion order by align_organism});
    my $get_max_id   = $self->dbh->prepare(q{select max(align_id) from alignversion});
    my $add_version  = $self->dbh->prepare(q{insert into alignversion values (?,?)});

    # clean the 'candncrna'-table
    $clear_cand->execute();

    # fetch versions already existing in table alignversion:
    # %versali maps align_id => ":"-joined organisms, %alivers is the reverse mapping.
    # Own variables are used here so that no fetched value leaks into $id or $org below.
    $get_versions->execute();
    while( my ($align_id, $align_org) = $get_versions->fetchrow_array ) {
        $versali{$align_id} = ( defined $versali{$align_id} ) ? $versali{$align_id}.":".$align_org : $align_org;
    }
    foreach my $align_id ( keys %versali ) {
        $alivers{$versali{$align_id}} = $align_id;
    }

    # insert $version in table alignversion if it does not exist yet;
    # an existing version keeps its align_id
    # ("best alignments": this entry is the version of a process using all alignments in estcoverage and pre-processing them with rnazWindow)
    unless( defined $alivers{$version} ) {
        $get_max_id->execute();
        my $max = ( $get_max_id->fetchrow_array )[0];
        $max = -1 unless defined $max;
        $add_version->execute($max+1, $version);
        $alivers{$version} = $max+1;
    }

    # fill a hash with conserved structure and sequence information and aligned organisms
    my $in_alignment = 0;
    open(my $rnaz, '<', $rnazoutfile) or die("Can not open the file $rnazoutfile: $!\n");
    while( my $line = <$rnaz> ) {
        if( !$in_alignment && $line =~ /^>/ ) {
            # first sequence of an alignment: the EST itself, followed by its sequence line
            $in_alignment = 1;
            my @field = split " ", $line;
            $field[0] =~ s/>//;
            ($id, $start, $strand) = @field[0, 1, 3];
            $seq = <$rnaz>;
            chomp( $seq );
        }
        elsif( $line =~ /^>consensus/ ) {
            # get the version number of an alignment and create a new one if necessary
            my $aliorg = join ":", sort @aliorg;
            unless( defined $alivers{$aliorg} ) {
                # insert new version in alignversion with align_id = max(align_id)+1
                $get_max_id->execute();
                my $max = ( $get_max_id->fetchrow_array )[0];
                $max = -1 unless defined $max;
                foreach my $organism ( @aliorg ) {
                    $add_version->execute($max+1, $organism);
                }
                $alivers{$aliorg} = $max+1;
            }
            my $vers = $alivers{$aliorg};
            @aliorg = ();

            # skip one line, then take the first word of the following line as structure
            my $skipped = <$rnaz>;
            my $str = <$rnaz>;
            $str = ( split " ", $str )[0];
            $sstruct{"$id:$start:$strand"} = [$seq, $str, $vers, $org];
            $in_alignment = 0;
            undef $org;
        }
        elsif( $in_alignment && $line =~ /^>/ ) {
            # further sequence of the alignment: stored as "name|start+1|field3|field4"
            my @field = split " ", $line;
            $field[0] =~ s/>gnl\|//;
            my $entry = $field[0]."|".($field[1]+1)."|".$field[2]."|".$field[3];
            $org = ( defined $org ) ? $org.":".$entry : $entry;
            # add organism to aligned organism list
            push @aliorg, lc( ( split /\|/, $field[0] )[0] );
        }
    }
    close($rnaz);

    open(my $cluster, '<', $clusteredoutfile) or die("Can not open the file $clusteredoutfile: $!\n");
    while( my $line = <$cluster> ) {
        my @field = split " ", $line;
        next unless @field;    # blank line

        if( @field==8 ) {
            # locus line: extract number of locus
            $field[0] =~ s/locus//;
            # update the EST length considering gaps and count them
            $field[3] = $self->get_gapincluded_indizes($field[1], $field[2]+1, $field[3]);
            # add predicted RNA genes to the 'candncrna'-table if primary key doesn't already exist with higher p-value
            $get_pvalue->execute($field[1], $field[2]+1);
            my $pval = ( $get_pvalue->fetchrow_array )[0];
            if( $pval && $pval<$field[6] ) {
                # delete old entry
                $del_cand->execute($field[1], $field[2]+1);
                # NOTE(review): this removes windows of the locus number of the current line,
                # the locus of the replaced candidate is not looked up - confirm intent
                $del_windows->execute($field[0]);
                $add_cand->execute($field[1], $field[2]+1, $field[3], $field[6], $field[0], $alivers{$version});
            }
            elsif( !$pval ) {
                $add_cand->execute($field[1], $field[2]+1, $field[3], $field[6], $field[0], $alivers{$version});
            }
        }
        else {
            # window line: extract number of locus and window
            $field[1] =~ s/locus//;
            $field[0] =~ s/window//;
            # the structure hash is keyed by the start as given in the file
            my $hit = $sstruct{"$field[2]:$field[3]:$field[5]"};
            my @structure = $hit ? @{$hit}[0..3] : (undef, undef, undef, undef);
            # update the EST length considering gaps and count them
            $field[4] = $self->get_gapincluded_indizes($field[2], $field[3]+1, $field[4]);
            # add the local conserved structures (windows) of the predicted RNA genes to the 'rnazwindows'-table
            $add_window->execute($field[1], $field[0], $field[3]+1, $field[4], $field[5], $field[17], $field[14], $field[15], $field[13], $field[8], @structure);
        }
    }
    close($cluster);

    $self->dbh->commit;
}


# Updates the annotation of candncrna entries; if an annotation already exists
# the new one is appended to it, separated by " : ".
#
# Input  - a reference to a hash including as key the ID (optionally followed by the start
#          position, separated by ":") and as value the annotation text
#          (query_start, query_end, subject_id, evalue, identity, subject_coverage and
#          description separated by ":")
#          The stored annotation is "description (subject_id)".
# Output - the number of processed hash entries
sub insert_cand_annotation {
    my ($self, $anno_hashref) = @_;
    my $nr = 0;

    my $sth1 = $self->dbh->prepare("select cand_annotation from candncrna where cand_id=?");
    my $sth2 = $self->dbh->prepare("update candncrna set cand_annotation=? where cand_id=?");
    my $sth3 = $self->dbh->prepare("select cand_annotation from candncrna where cand_id=? and cand_start=?");
    my $sth4 = $self->dbh->prepare("update candncrna set cand_annotation=? where cand_id=? and cand_start=?");

    foreach my $key ( keys %$anno_hashref ) {
        my @item = split ":", $$anno_hashref{$key};
        # a description that contains ":" itself was cut into several fields; glue it together again
        if( @item > 7 ) {
            my $description = join ":", splice(@item, 6);
            push @item, $description;
        }
        chomp $item[6];
        my $anno = $item[6]." ($item[2])";

        if( index($key, ":")>-1 ) {
            # key "id:start": address exactly one candidate
            my ($id, $start) = split ":", $key;
            $sth3->execute($id, $start);
            my $oldanno = ( $sth3->fetchrow_array )[0];
            $anno = $oldanno." : ".$anno if defined $oldanno;

            $sth4->execute($anno, $id, $start);
        }
        else {
            # key "id": the update addresses every candidate with this id
            $sth1->execute($key);
            my $oldanno = ( $sth1->fetchrow_array )[0];
            $anno = $oldanno." : ".$anno if defined $oldanno;

            $sth2->execute($anno, $key);
        }
        $nr++;
    }

    return $nr;
}


# Updates the cand_utr column of candncrna entries; if a value already exists
# the new one is appended to it, separated by " : ".
#
# Input  - a reference to a hash including as key "id:start" and as value the text to store
# Output - the number of processed hash entries
sub insert_cand_utr {
    my ($self, $utr_hashref) = @_;

    my $update = $self->dbh->prepare("update candncrna set cand_utr=? where cand_id=? and cand_start=?");
    my $select = $self->dbh->prepare("select cand_utr from candncrna where cand_id=? and cand_start=?");

    my $count = 0;
    for my $key ( keys %{$utr_hashref} ) {
        my $text = $utr_hashref->{$key};
        my ($cand_id, $cand_start) = split ":", $key;

        # keep what is already stored and append the new text
        $select->execute($cand_id, $cand_start);
        my ($previous) = $select->fetchrow_array;
        if( defined $previous ) {
            $text = $previous." : ".$text;
        }

        $update->execute($text, $cand_id, $cand_start);
        $count++;
    }

    return $count;
}


# Fills the annotatedncrna table with one row per hash entry.
#
# Input  - a reference to a hash including as key the ID and as value the annotation
#          (query_start, query_end, subject_id, evalue, identity, subject_coverage and description separated by ":")
# Output - the number of inserted rows
sub insert_annotatedncrna {
    my ($self, $anno_refhash) = @_;
    my $nr = 0;

    my $sth = $self->dbh->prepare("insert into annotatedncrna values (?,?,?,?,?,?,?,?)");
    foreach my $id ( keys %$anno_refhash ) {
        my @item = split ":", $$anno_refhash{$id};
        # a description that contains ":" itself was cut into several fields; glue it together again
        if( @item > 7 ) {
            my $description = join ":", splice(@item, 6);
            push @item, $description;
        }
        # missing trailing fields are handed over as undef
        $sth->execute($id, @item[0..6]);
        $nr++;
    }

    return $nr;
}


# Updates the annotation of knownncrna entries; if an annotation already exists
# the new one is appended to it, separated by " : ".
#
# Input  - a reference to a hash including as key the ID (id:start) and as value the annotation text
#          (query_start, query_end, subject_id, evalue, identity, subject_coverage and description separated by ":")
#          The stored annotation is "description (subject_id)".
# Output - the number of processed hash entries
sub update_known_ncRNA {
    my ($self, $anno_refhash) = @_;
    my $nr = 0;

    my $sth = $self->dbh->prepare("update knownncrna set known_annotation=? where known_id=? and known_start=?");
    my $sth2 = $self->dbh->prepare("select known_annotation from knownncrna where known_id=? and known_start=?");
    foreach my $key ( keys %$anno_refhash ) {
        my ($id, $start) = split ":", $key;

        my @item = split ":", $$anno_refhash{$key};
        # a description that contains ":" itself was cut into several fields; glue it together again
        if( @item > 7 ) {
            my $description = join ":", splice(@item, 6);
            push @item, $description;
        }
        chomp $item[6];
        my $anno = $item[6]." ($item[2])";

        # keep what is already stored and append the new annotation
        $sth2->execute($id, $start);
        my $fam = ( $sth2->fetchrow_array )[0];
        $anno = $fam." : ".$anno if defined $fam;

        $sth->execute($anno, $id, $start);
        $nr++;
    }

    return $nr;
}


# Collects the file names ("id.start.aligntype.aln") of the alignments of candidate ncRNAs.
# Only conserved ESTs whose id occurs in 'candncrna' and whose alignment type
# contains "bosTau2" are taken into account.
#
# Output - a reference to an array including the file names
sub cand_aln_files {
    my ($self) = @_;
    my @names;

    my $sql = q{select cons_id, cons_start, estcov_align_type from conservedest, estcoverage where cons_id=estcov_source_id and cons_start=estcov_source_start and cons_id in (select cand_id from candncrna) and estcov_align_type like "%bosTau2%" group by cons_id, cons_start, estcov_align_type};

    my $handle = $self->dbh->prepare($sql);
    $handle->execute();

    # one file name per result row
    my ($cons_id, $cons_start, $align_type);
    $handle->bind_columns(\$cons_id, \$cons_start, \$align_type);
    push @names, "$cons_id.$cons_start.$align_type.aln" while $handle->fetch;

    return \@names;
}


# Fills the microRNA table with RNAmicro-results.
#
# Input  - a reference to a hash including as key the ID and as value all hits separated by ":"
#          whereby each hit is presented by the start pos., end pos., strand, p-value separated by space
# Output - the number of ESTs predicted as microRNA (0 for an empty hash)
sub insert_microRNA {
    my ($self, $micro_hashref) = @_;
    my $nr = 0;    # starts at 0 so that an empty hash yields 0 and not undef

    my $sth = $self->dbh->prepare("insert into microrna values (?,?,?,?,?)");

    foreach my $id ( keys %$micro_hashref ) {
        # one row per hit of this EST
        foreach my $hit ( split ":", $$micro_hashref{$id} ) {
            my @micro = split " ", $hit;
            $sth->execute($id, @micro[0..3]);
        }
        $nr++;
    }

    return $nr;
}


# Fills the snoRNA table with the snoRNA prediction results.
#
# Input  - a reference to a hash including as key the ID and as value a reference to an array;
#          element 1 is written to the second and element 0 to the third column
#          (presumably strand and p-value - confirm against the table definition)
# Output - the number of ESTs predicted as snoRNA (0 for an empty hash)
sub insert_snoRNA {
    my ($self, $sno_hashref) = @_;
    my $nr = 0;    # starts at 0 so that an empty hash yields 0 and not undef

    my $sth = $self->dbh->prepare("insert into snorna values (?,?,?)");

    foreach my $id ( keys %$sno_hashref ) {
        $sth->execute($id, $$sno_hashref{$id}[1], $$sno_hashref{$id}[0]);
        $nr++;
    }

    return $nr;
}


# Counts the rows of the 'candncrna'-table.
#
# Output - the number of ncRNA candidates
sub get_nr_candncrna {
    my ($self) = @_;

    my $counter = $self->dbh->prepare("select count(*) from candncrna");
    $counter->execute();
    my ($candidates) = $counter->fetchrow_array;

    # read the handle to its end
    1 while $counter->fetchrow_array;

    return $candidates;
}


# Gets the homologous sequences of one organism for all candidates of ncRNAs
# that have an RNAz window with a p-value above 0.9.
#
# Input  - the searched organism name (f.e. hg17); upper and lower case are not distinguished
# Output - a reference to a hash whereby the key describes the EST (id, start, end separated by ":")
#          and the value the chromosome, start pos., end pos. and strand of the homologous
#          sequence separated by ":".
#          If several windows belong to one EST, chromosome and start are taken from the
#          first fetched window, end and strand from the last one.
sub cand_homologous_seq {
    my ($self, $org) = @_;
    my ($cid, $cstart, $cend, $align);
    my (%chr_of, %start_of, %end_of, %strand_of, %out);

    # get all candidates of ncRNAs and the homologous organism $org
    my $query = q{select cand_id, cand_start, cand_end, rnaz_align_organisms_list from rnazwindows, candncrna where rnaz_locus=cand_locus and rnaz_pvalue>0.9 and rnaz_align_organisms_list like ?};
    my $sth1 = $self->dbh->prepare($query);
    $sth1->execute("%".$org."%");
    $sth1->bind_columns(\$cid, \$cstart, \$cend, \$align);

    # the name is matched literally and case-insensitively at the start of a list entry,
    # so that names with capitals (f.e. bosTau2) are found as well
    my $starts_with_org = qr/^\Q$org\E/i;

    while( $sth1->fetch ) {
        # entries of the list are separated by ":", the fields of one entry by "|";
        # the last entry of the searched organism wins
        my @hg;
        foreach my $entry ( split ":", $align ) {
            @hg = split /\|/, $entry if $entry =~ $starts_with_org;
        }
        next unless @hg;

        my $key = "$cid:$cstart:$cend";
        $chr_of{$key}    = $hg[1] unless defined $chr_of{$key};
        $start_of{$key}  = $hg[2] unless defined $start_of{$key};
        # field 3 is used as a length: end = start + length - 1
        $end_of{$key}    = $hg[2]+$hg[3]-1;
        $strand_of{$key} = $hg[4];
    }

    # generate returned hash
    foreach my $key ( keys %chr_of ) {
        $out{$key} = $chr_of{$key}.":".$start_of{$key}.":".$end_of{$key}.":".$strand_of{$key};
    }

    return \%out;
}


# Calculates the false positive rate of RNAz predictions for the given p-value
# (number of shuffled scanning windows/number of original scanning windows).
#
# Input  - (1) the clustered RNAz output file of the shuffled alignments
#          (2) the p-value
# Output - the false positive rate
# Dies if the file is missing or can not be opened, and with "Illegal division by zero"
# if no window of 'rnazwindows' has a p-value above the given one.
sub RNAz_statistics {
    my ($self, $rnazrandomcluster, $p) = @_;

    # three-argument open with an undefined name would silently open a temporary file
    die("Can not open the file!\n") unless defined $rnazrandomcluster && length $rnazrandomcluster;

    # count the windows of the shuffled data with a p-value higher than $p
    my $randnr = 0;
    open(my $in, '<', $rnazrandomcluster) or die("Can not open the file $rnazrandomcluster: $!\n");
    while( my $line = <$in> ) {
        my @field = split " ", $line;
        next if @field < 18;    # too short to carry a p-value in field 17
        $randnr++ if( index($field[0],"window")>=0 && $field[17]>$p );
    }
    close($in);

    # count the windows of the original data with a p-value higher than $p
    my $sth = $self->dbh->prepare(q{select count(*) from rnazwindows where rnaz_pvalue>?});
    $sth->execute($p);
    my $orignr = ( $sth->fetchrow_array )[0];

    # calculate false positive rate
    return $randnr/$orignr;
}


# Calculates the false positive rate of RNAmicro predictions for the given p-value
# (number of shuffled hits/number of original hits).
#
# Input  - (1) a reference to a hash including as key the ID and as value all hits separated by ":"
#              whereby each hit is presented by the start pos., end pos., strand, p-value separated by space
#          (2) the p-value
# Output - the false positive rate
sub RNAmicro_statistics {
    my($self, $micro_refhash, $p) = @_;

    # count the hits of the shuffled data with a p-value higher than $p
    my $shuffled = 0;
    for my $est ( keys %{$micro_refhash} ) {
        for my $hit ( split ":", $micro_refhash->{$est} ) {
            my @parts = split " ", $hit;
            if( $parts[3] > $p ) {
                $shuffled++;
            }
        }
    }

    # get the hits of the original data with a p-value higher than $p
    my $counter = $self->dbh->prepare(q{select count(*) from microrna where micro_pvalue>?});
    $counter->execute($p);
    my ($original) = $counter->fetchrow_array;

    # false positive rate
    return $shuffled/$original;
}


# Calculates the false positive rate of snoRNA predictions for the given p-value
# (number of shuffled hits/number of original hits).
#
# Input  - (1) a reference to a hash including as key the ID and as value a reference to an array
#              whose element 0 is compared with the p-value
#          (2) the p-value
# Output - the false positive rate
sub get_snoRNA_statistics {
    my($self, $sno_refhash, $p) = @_;

    # number of hits of the shuffled data above the threshold
    my $shuffled = 0;
    for my $est ( keys %{$sno_refhash} ) {
        next unless $$sno_refhash{$est}[0] > $p;
        $shuffled++;
    }

    # number of hits of the original data above the threshold
    my $counter = $self->dbh->prepare(q{select count(*) from snorna where sno_pvalue>?});
    $counter->execute($p);
    my ($original) = $counter->fetchrow_array;

    # false positive rate
    return $shuffled/$original;
}


# Fills the ncbiannotation table with blastn hits of candidate ncRNAs to the NCBI databases.
#
# Input  - a reference to a hash including as key the EST id, EST start, subject id separated by ":"
#          and as value query start, query end, e-value, identity (identity/align_length),
#          EST coverage (align_length/query_length) and description also separated by ":"
# Output - the number of new database entries (0 for an empty hash)
sub insert_ncbi_annotation {
    my ($self, $anno_hashref) = @_;
    my $nr = 0;    # starts at 0 so that an empty hash yields 0 and not undef

    my $sth = $self->dbh->prepare("insert into ncbiannotation values (?,?,?,?,?,?,?,?,?)");

    foreach my $id ( keys %$anno_hashref ) {
        my @key = split ":", $id;
        my @val = split ":", $$anno_hashref{$id};
        # a description that contains ":" itself was cut into several fields; glue it together again
        if( @val > 6 ) {
            my $description = join ":", splice(@val, 5);
            push @val, $description;
        }

        # column order: EST id, EST start, query start, query end, subject id, then the remaining values
        $sth->execute($key[0], $key[1], $val[0], $val[1], $key[2], $val[2], $val[3], $val[4], $val[5]);

        $nr++;
    }

    return $nr;
}
1;

__END__
# Below is stub documentation for your module. You'd better edit it!

=head1 NAME

EST2ncRNA::MysqlInterfaceproteinupdate - DBI-based MySQL access layer of the EST2ncRNA modules

=head1 SYNOPSIS

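  use EST2ncRNA::MysqlInterfaceproteinupdate;

  my $db = EST2ncRNA::MysqlInterfaceproteinupdate->new($datasource, $username, $password);
  my $nr = $db->get_nr_candncrna;    # number of ncRNA candidates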

=head1 DESCRIPTION

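EST2ncRNA::MysqlInterfaceproteinupdate wraps a DBI connection to a MySQL
database. The connection is opened on first use with C<RaiseError> switched
on and C<AutoCommit> switched off.

The methods read and write the tables of the database, among others
C<estcoverage>, C<candncrna>, C<rnazwindows>, C<alignversion>,
C<annotatedncrna>, C<knownncrna>, C<microrna>, C<snorna> and
C<ncbiannotation>.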

=head2 EXPORT

None by default. C<prepare_db> is listed in C<@EXPORT_OK> and can be imported on request.



=head1 SEE ALSO

Mention other useful documentation such as the documentation of
related modules or operating system documentation (such as man pages
in UNIX), or any relevant external documentation such as RFCs or
standards.

If you have a mailing list set up for your module, mention it here.

If you have a web site set up for your module, mention it here.

=head1 AUTHOR

Stefan Seemann, E<lt>seemann@bioinf.uni-leipzig.deE<gt>

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2006 by Stefan Seemann

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself, either Perl version 5.8.6 or,
at your option, any later version of Perl 5 you may have available.


=cut
