#!/usr/bin/perl -w

# Computes the K-category correlation coefficient.
# Copyright: Jan Gorodkin, gorodkin@bioinf.kvl.dk
# This software is distributed with a 
# GNU GENERAL PUBLIC LICENSE, see http://www.gnu.org/licenses/gpl.txt

# Copyright (C) 2004  Jan Gorodkin
# 
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.

# For publication of results, please cite:
#  Comparing two K-category assignments by a K-category correlation coefficient.
#  J. Gorodkin, Computational Biology and Chemistry, 28:367-374, 2004.




# init settings:
# --------------
use Getopt::Long;

my $scriptname      = (split(/\//,$0))[-1];

# read options:
# -------------
die "No such option\n"
  unless(GetOptions("help|h"));

# check if help menu should be displayed:
# ---------------------------------------
&usage if ($opt_help || $opt_h);


# misc set up:
# ------------
my $n=0;
my $K=0;
my $prodXY=0;
my $prodYY=0;
my $prodXX=0;


# read data:
# ----------

  while(<>) {
    chomp;
    @F=split(/\s+/," $_");
    if($#F>1){
     if($n==0) { # process first line
       if($#F%2!=0){ 
          print STDERR "Odd number of fields. -$_- Bail out";
          exit();
       }
       else {
         $K=$#F/2;   # set the first K. Below only accept same size K's!
         for($k=1;$k<=$K;$k++) { $w[$k]=1; } # for now fix w_K=1
       }
     }
     if($#F/2 != $K){
       print STDERR "WARNING fields don't match first K. -$_- Skipping.\n";
     }
     else {
      for($k=1;$k<=$K;$k++) {  # get data and compute relevant terms on the fly
         $Ynk=$F[$k];
         $Xnk=$F[$k+$K];
         $aveY_K[$k]+=$Ynk;
         $aveX_K[$k]+=$Xnk;
         $prodXY+=$w[$k]*$Xnk*$Ynk;
         $prodYY+=$w[$k]*$Ynk*$Ynk;
         $prodXX+=$w[$k]*$Xnk*$Xnk;
      }
      $n++;
     }
   }
  }  # end while looping of data


# process data:
# -------------
my $N=$n;

  # covariance of YY
  $sum_aveY=0; for($k=1;$k<=$K;$k++) {
     $sum_aveY+=$w[$k]*$aveY_K[$k]*$aveY_K[$k];
  }
  $COV_YY=$prodYY - $sum_aveY/$N;
  # NOTE! I have $sum_aveY/$N rather than $sum_aveY * $N as
  # $sum_aveY never was normalized with $N!! same for
  # $sum_aveX and $sum_aveXY below!

  # covariance of XX
  $sum_aveX=0; for($k=1;$k<=$K;$k++) {
     $sum_aveX+=$w[$k]*$aveX_K[$k]*$aveX_K[$k];
  }
  $COV_XX=$prodXX - $sum_aveX/$N;

  $denominator = $COV_XX*$COV_YY;
  
  if($denominator == 0 ) {
     print STDERR "Denominator is zero. Bail out.\n";
     exit();
  }

  # covariance of XY
  $sum_aveXY=0; for($k=1;$k<=$K;$k++) {
     $sum_aveXY+=$w[$k]*$aveX_K[$k]*$aveY_K[$k];
  }
  $COV_XY=$prodXY - $sum_aveXY/$N;

  $RK=$COV_XY/sqrt($denominator);
  
  $b=$COV_XY/$COV_XX;
  $bprime=$COV_XY/$COV_YY;
  #$test = sqrt($b * $bprime);

  print "$RK     $b     $bprime    $COV_XY  $COV_XX   $COV_YY\n";



################################################################################
sub usage{

print<<USAGE_MSG;

Usage:
------
  $scriptname <file>

Description:
------------
  $scriptname compares two NxK tables X and Y of data and the content of
  <file>  should be these tables organized row by row as
               Y_n1, Y_n2, ... , Y_nK   X_n1, X_n2, ... , X_nK 

  In computing R_K w_K = 1/K for all K.

  The output is
      R_K   b    b'   COV(X,Y)   COV(X,X)   COV(Y,Y)
  See the reference for details.

  Related program is rkorrC which computes the dicrete version of R_K
  from the KxK confusion matrix.
  
Options:
--------
  -h|--help               Show this message.

Reference:
----------
  Comparing two K-category assignments by a K-category correlation coefficient.
  J. Gorodkin, Computational Biology and Chemistry, 28:367-374, 2004.


Author:
-------
Jan Gorodkin, gorodkin\@bioinf.kvl.dk.

USAGE_MSG
    
    exit (1);
} # end usage

################################################################################
$opt_help="";
$opt_h="";


################################## end of file #################################

