#!/usr/bin/env perl   

#  -*- perl -*-

#use strict;
#use warnings FATAL => qw ( all );
use Data::Dumper;

# COLS2RDC: 
# Ebbe Sloth Andersen, August 2005.

# >>>>>>>>>>>>>>>>>> SPLIT RDC <<<<<<<<<<<<<<<<<<<<<<<<<

my ( @entries, $file );

( $file ) = @ARGV;

@entries = &read_cols ( $file );

#print Dumper ( @entries );
#exit;

&write_rdc ( @entries );

# >>>>>>>>>>>>>>>>>> SUBROUTINES <<<<<<<<<<<<<<<<<<<<<<<

sub read_cols
{
    # READ_COLS v1.1: Reads column format into data structure with an array of hashes.
    # Reads any number of columns with any kind of data.
    # Ebbe Sloth Andersen, December 2004.
   
    my ( $fname ) = @_;

    # Returns an array of hashes. The key corresponds to the header.
    # The value contains body data points seperated by comma.

    my ( $line,        # input lines
	 @entries,     # the output array of hashes
	 @cols, 
	 $TYPE,        
	 @COL,         # any number of columns
	 $ENTRY, 
	 @data, 
	 $i,           # counting numbers
         $n,
	 $length,
	 $counter,
         $filename,
         $start,
         $end,
         );

    # initialize
    $counter = 0;	    
    @data = ( );
    @entries = ( );
    @COL = ( );

    $start = 1;
    $end = 500;

    for ( $n = 0; $n < 300; $n++ ) {
        $filename = $fname . "_" . $start . "_" . $end . "_struc.col";
        if ( not open FILE, "< $filename" ) {
        #    die "file not found!";
        }
        while ( $line = <FILE> )
        {
            if ( $line =~ /^; \*+/ )            # finds footer
            {
                if ( $TYPE eq "TREE" ) {   # der er et trae i hver fil
                    # reset all
                    @data = ( );
                    @COL = ( );
                } else {
                    push @entries, 
                    {
                        "TYPE" => $TYPE,
                        "ENTRY" => $ENTRY,
                        "ALIENTRY" => $filename,
                        "ALITYPE" => "nucleotide",
                    };                    
                    for ( $i=0; $i<$length; $i++ ) {
                        map { $entries[$counter]->{$COL[$i]} = $data[$i] } @entries;
                    }
                    $counter++;
                    @data = ( );
                    @COL = ( );
                }
            }
            elsif ( $line =~ /^;/ )            # reads header
            {
                if ( $line =~ /^; TYPE\s+(\S+)/ ) {
                    $TYPE = $1;
                } elsif ( $line =~ /^; COL\s+\S+\s+(\S+)/ ) {
                    push ( @COL, $1 );
                } elsif ( $line =~ /^; ENTRY\s+(\S+)/ ) { 
                    $ENTRY = $1;
                }
            }
            else                               # reads body
            {
                @cols = split(/\s+/, $line);
                $length = scalar @cols;
                # add sequences on an array
                for ( $i=0; $i<$length; $i++ ) {
                    $data[$i] .= "$cols[$i],";
                }
            }   

        } 
        close FILE;
        @cols = ( );
        @data = ( );
        @COL = ( );
        $TYPE = "";
        $ENTRY = "";
        $filename = "";
        $start = $start + 250;
        $end = $end + 250;
    }

#    print Dumper ( @entries );
#    exit;


    return wantarray ? @entries : \@entries;
}

sub write_rdc
{
    # WRITE_RDC: outputs RNA-decoder file.
    # Ebbe Sloth Andersen, December 2004.

    my ( @entries ) = @_;

    # Returns nothing.

    my ( $entry,    # runs through entries
	 $count,    # counting numbers
	 $length, 
	 $i,
         $n,
         $a,
         @ALIENTRY,
         $ALIENTRY,
         $test,
         @AE,
         $AE,
         @certainty,
         @align_bp,
         );

    # print header
    print "; This file was generated by cols2rdc\n";
    print "; ==========\n";

    # get array of aligenment entries
    @ALIENTRY = ( );
    $test = "first";
    foreach $entry ( @entries ) {
        if ( $entry->{'ALIENTRY'} ne $test ) {
            push ( @ALIENTRY, $entry->{'ALIENTRY'} );
            $test = $entry->{'ALIENTRY'};
        }
    }
    foreach $ALIENTRY ( @ALIENTRY ) {
        @AE = ( );
        @AE = grep { $_->{'ALIENTRY'} eq $ALIENTRY } @entries;    

#        print Dumper ( @AE );
#        exit;

        print "; ENTRY\t$AE[0]->{'ALIENTRY'}\n";
        print "; TYPE\t$AE[0]->{'ALITYPE'}\n";
        $count = 1;
        print "; COL " . $count . "\tcodonmask\n";
        $count++;   
        foreach $entry ( @AE ) {
            print "; COL " . $count . "\tsymbols taxa=\"" . substr $entry->{'ENTRY'}, 0, 30; print "\"\n";
            $count++;
        }
        print "; COL " . $count . "\tpairedPosition\n";
        $count++;
        print "; COL " . $count . "\tpositions\n";  # should be big alignment positions, but I don't have them
        $count++;
        print "; COL " . $count . "\tposteriorProbabilities\n";   # same as certainty 
        $count++;
        print "; COL " . $count . "\tlabels type=\"stateAnno\"\n"; 
        print "; ----------\n";
        
        # print columns
        $entry = $AE[1];
        $length = length $entry->{'residue'};   # length includes both symbol and comma
        @certainty = ( );
        @certainty = split ( /,/, $entry->{'certainty'} );
        @align_bp = ( );
        @align_bp = split ( /,/, $entry->{'align_bp'} );

#        print Dumper ( @entries );
#        exit;

        $n = 0;
        for ($i = 0; $i < $length; $i++ ) {
            # print paired positions
            print "3\t";
            foreach $entry ( @AE ) {
                print uc(substr $entry->{'residue'}, $i, 1); 
                print "\t";
            }
            $i++;
            # print paired positions
            print "$align_bp[$n]\t";
            # print positions
            $a++;
            print "$a\t";
            # print posterior probabilities
            print "$certainty[$n]\t";
            # print state anno
            print "$align_bp[$n]\n";
            $n++;
        }
        $a = $a - 250;
        # print footer
        print "; **********\n";
    }
}

