#! /bin/tcsh -f
#
# Use this script to cluster RNA sequences into groups of
# different secondary structure using Pfold:
#
#   - First argument is the fasta file to analyze
#   - Optional second argument is the position range
#
# Output is a number of files:
#
#   - "XXX_groups_???.txt" has all groupings from individual sequences to
#     one large group. Each splitting takes up one file, with a group
#     on each line with the score next to it. '???' is the number of
#     groups in the file.
#
#   - "XXX_groups_max.txt" has the grouping with the highest score.
#
#   - "XXX_groups_best.txt" has a grouping with a slightly lower score, but
#     fewer groups.
#
#   - "XXX_score.dat" is a datafile for plotting of scores as a function
#     of number of groups
#
# How it works:
#
#   A 'state' is a splitting of the sequences. We start with all
#   individual sequences in their own group.
#
#   A 'move' is a joining of two groups. The algorithm always does the
#   move that increases the score the most.
#

# Validate the command line before doing any work: with no fasta file
# given, later commands that take the file name would end up reading
# standard input instead and the script would appear to hang.
if ($#argv < 1) then
  echo "Usage: $0 fasta_file [position_range]"
  exit 1
endif
if (! -r "$1") then
  echo "Cannot read fasta file: $1"
  exit 1
endif

# The helper programs are located relative to SARSE_HOME, so it must be set.
if (! $?SARSE_HOME) then
  echo "SARSE_HOME is not set, cannot locate the pcluster programs"
  exit 1
endif

# Directory holding the pcluster helper programs used by this script.
set PCLUSTER = "${SARSE_HOME}/programs/pcluster/bin"

# NOTE(review): presumably pins the locale so the helper tools format
# numbers consistently -- confirm.
setenv LANG en_US

# Base file name: the fasta file name without its extension, with the
# position range appended when one was given. All output files are named
# after it.
if ("$2" == "") then
  set BASE = "$1:r"
else
  set BASE = "$1:r_$2"
endif

# Build initial state file: run runscfg.tcsh once for every single
# sequence and collect the output, so that each sequence starts out in a
# group of its own.
echo ""
echo "Making initial state file"
cat /dev/null > tmp_state.$$.txt
# One index (1..N) per fasta header line. The file name is quoted so that
# paths containing whitespace or glob characters survive; $2 is left
# unquoted on purpose so that an absent position range adds no argument.
foreach range ( `grep "^>" "$1" | awk '{print NR}'` )
  # Progress indicator: the index followed by a space, no newline.
  echo -n $range ""
  $PCLUSTER/runscfg.tcsh "$1" $range $2 >> tmp_state.$$.txt
end
echo ""

# Build initial moves file: run runscfg.tcsh once for every unordered pair
# of sequences "i,j" (i < j) and collect the output. Each pair is a
# candidate join of two single-sequence groups.
echo ""
echo "Making initial moves file"
cat /dev/null > tmp_moves.$$.txt
# awk counts the fasta header lines and emits all pairs in its END block.
# The file name is quoted so that paths containing whitespace or glob
# characters survive; $2 is left unquoted on purpose so that an absent
# position range adds no argument.
foreach range ( `grep "^>" "$1" | awk 'END {for (i = 1; i <= NR ; i++){for (j = i+1; j <= NR ; j++) {print i "," j}}}'` )
  # Progress indicator: the pair followed by a space, no newline.
  echo -n $range ""
  $PCLUSTER/runscfg.tcsh "$1" $range $2 >> tmp_moves.$$.txt
end
echo ""

# Record the starting point as the first grouping.
# The group count is the number of lines in the state file, zero padded to
# at least three digits so it can be used in the output file name.
set cnt = `awk 'END {printf "%-3.3d", NR}' tmp_state.$$.txt`

# Total score: each group's score (second field) is weighted by its number
# of sequences (commas in the first field plus one) and the products are
# summed. Lines whose first field is ";" are skipped.
set score_curr = `awk '$1 != ";" {gsub("[^,]","",$1); s += $2*(length($1)+1)} END {print s}' tmp_state.$$.txt`

# Save the initial state file under its group count.
cp tmp_state.$$.txt "${BASE}"_groups_${cnt}.txt

echo "Current score: " $score_curr

# Start a fresh score table; its first row is the initial grouping.
echo ${cnt} ${score_curr} > "${BASE}"_score.dat

# The core iteration of the algorithm. Each pass asks findstate for the
# next state, saves it, appends its score to the score table and then
# refreshes the moves file, until a single group is left.
echo "Starting iteration"

label_iter:

# Derive the next state from the current state and the known moves.
$PCLUSTER/findstate tmp_state.$$.txt tmp_moves.$$.txt > tmp_state_best.$$.txt

cp tmp_state_best.$$.txt tmp_state.$$.txt

# The first field of the first line of the new state is taken as the group
# that was just formed.
set best_move = `head -1 tmp_state.$$.txt | awk '{print $1}'`

echo ""
echo "New group:" $best_move

# Number of lines in the new state, zero padded to at least three digits.
set cnt = `awk 'END {printf "%-3.3d", NR}' tmp_state.$$.txt`

# An empty state means findstate produced nothing (for example because the
# helper is missing or failed). The count would then never reach "001" and
# the loop would spin forever, so remove the scratch files and give up.
if ($cnt == "000") then
  echo "findstate produced an empty state, aborting"
  rm -f tmp_moves.$$.txt tmp_moves_best.$$.txt tmp_state.$$.txt tmp_state_best.$$.txt
  exit 1
endif

# Save the new state file:
cp tmp_state.$$.txt "$BASE"_groups_$cnt.txt

# Find score by multiplying scores for a group with the number of
# sequences in the group. Add them all to get the total score.
set score_curr = `awk '$1 != ";" {gsub("[^,]","",$1); s += $2*(length($1)+1)} END {print s}' tmp_state.$$.txt`

echo "Current score: " $score_curr
echo $cnt $score_curr >> "$BASE"_score.dat

# Stop when only one group is left. The comparison is on the padded string
# because the count was formatted with leading zeros.
if ($cnt == "001") goto label_done

# Make new moves file

# Remove obsolete moves
$PCLUSTER/findmoves tmp_state_best.$$.txt tmp_moves.$$.txt > tmp_moves_best.$$.txt

cp tmp_moves_best.$$.txt tmp_moves.$$.txt

# Add new moves: pair the newly formed group with every other group, i.e.
# with the first field of every state line after the first. The shell
# splices $best_move into the awk program text between the quote pairs.
# The fasta file name is quoted so that paths containing whitespace
# survive; $2 stays unquoted so an absent position range adds no argument.
foreach join (`awk 'NR > 1 {print "'"$best_move"',"$1}' tmp_state.$$.txt`)
  echo "Calculating the move:" $join
  $PCLUSTER/runscfg.tcsh "$1" $join $2 >> tmp_moves.$$.txt
end

goto label_iter

label_done:

# Clean up the scratch files. -f keeps rm quiet about
# tmp_moves_best.$$.txt, which does not exist when the iteration finished
# on its first pass (findmoves is only run after the stop check).
rm -f tmp_moves.$$.txt tmp_moves_best.$$.txt tmp_state.$$.txt tmp_state_best.$$.txt

# Publish two of the saved groupings under fixed names. findbest reads the
# score table and prints the group count to use.

# Highest scoring grouping. NOTE(review): the extra "0" argument is
# presumably a tolerance of zero -- confirm against findbest.
set max_group = `$PCLUSTER/findbest "${BASE}"_score.dat 0`
echo ""
echo "Maximum scoring group:" $max_group
cp "${BASE}"_groups_${max_group}.txt "${BASE}"_groups_max.txt

# The "best" grouping, as chosen by findbest with its default settings.
set best_group = `$PCLUSTER/findbest "${BASE}"_score.dat`
echo ""
echo '"Best" group:' $best_group
cp "${BASE}"_groups_${best_group}.txt "${BASE}"_groups_best.txt

