# This folder contains the dataset used for benchmarking RNAbound.
# If you have any queries, please contact sabari@rth.dk, seemann@rth.dk


# I) Data freeze
# ***************
# The final benchmark datasets used to generate the tables and figures are listed below. Further details about their source and the pre/post-processing steps can be found in Section II below.

# filtered annotations (corresponding to Table 1)
dataset/windows/{window_100_benchmark_data.bed, window_150_benchmark_data.bed, window_200_benchmark_data.bed}
# corresponding multiple sequence alignments
dataset/alignment/{out_100_nogaps_0.75.maf, out_150_nogaps_0.75.maf, out_200_nogaps_0.75.maf}
# Results of RNAbound predictions (with default parameters flank 10 and pnull 0.0005) with PETfold and RNAfold base pair probabilities  
results/{petfold_rnabound_window_100_10_0.0005, petfold_rnabound_window_150_10_0.0005, petfold_rnabound_window_200_10_0.0005}
results/{rnafold_rnabound_window_100_10_0.0005, rnafold_rnabound_window_150_10_0.0005, rnafold_rnabound_window_200_10_0.0005}
# top hits of RNAbound and Dotu et al., approaches (used for comparison in Figure 1,2; Table 2, A1, A2) 
results/summary/{rnabound_dotu_boundaries_multipleseq_f10_p0.0005.tsv, rnabound_dotu_boundaries_singleseq_f10_p0.0005.tsv}

#  II) source information
#  **********************
# ======================================================================
# A) Input dataset (structured RNA annotations/multiple sequence alignment)
# ======================================================================
# a) Annotations:
# --------------
# Known structured RNA sequences from various resources such as Rfam (v12.2), miRBase (v21), tRNAdb[18], rRNA (silva)[19] and snoRNAdb[20] were mapped to the human genome (hg38). For sequences from Rfam seed alignments, we used the BLAST tool to find 100% identical matches in the human genome.
dataset/annotations/all_filtered_annotations.bed.gz

# b) Alignment:
# ------------
# Multiple sequence alignments were generated for each annotated region by the following steps:
# 1. extract the maf blocks by overlapping the annotation with each pairwise ucsc alignment between human and species X
# 2. subset the pairwise mafs by requiring at least 10 flanking nucleotides around the annotation (full length maf kept, not cut to match requirement)
# 3. further subset the pairwise mafs by requiring at least 100 nucleotides on each side of the center of the annotation (full length maf kept, not cut to match requirement)
# 4. build a multiple alignment from the pairwise alignments obtained in 3. with the roast command from tba (the threaded block aligner used in multiz)
# roast + R=30 M=1 E=hg38_primary '((((hg38_primary tupChi1) (((speTri2 (jacJac1 (micOch1 mm10))) (hetGla2 chiLan1)) oryCun2)) (orcOrc1 (felCat8 pteAle1))) (triMan1 dasNov3))' hg38_primary.*sing.maf 14way.maf
dataset/alignment/14way.maf.gz

# c) structure details
# --------------------
# Consensus structures from the Rfam seed alignments were used to generate the RNA shapes (using the RNAshapes program)
# $ conda env create -f scripts/environment.yml
# $ source activate rnabound
# $ perl scripts/extract_consensus_structure.pl dataset/annotations/Rfam12.2.seed.gz >dataset/annotations/structure_percentage_rnashape.txt
dataset/annotations/structure_percentage_rnashape.txt

# d) non-overlapping annotations
# ------------------------------
# list of annotations that overlap no other annotation and whose closest other annotation is more than 200 bp away (upstream or downstream)
# bedtools intersect -a <(zgrep ^chr dataset/annotations/all_filtered_annotations.bed.gz) -b <(zgrep ^chr dataset/annotations/all_filtered_annotations.bed.gz) -c | awk '$7==1' >/tmp/nonoverlaps;bedtools closest -a /tmp/nonoverlaps -b /tmp/nonoverlaps -io -d | awk '$15>200' >dataset/annotations/nonoverlapping_annotation_dist200.bed
dataset/annotations/nonoverlapping_annotation_dist200.bed


# ========================================================================================================
# B) Benchmarking of RNAbound and Dotu  et al., approaches on multiple sequence alignment and single sequence
# ========================================================================================================
# create windows of different sizes (100, 150, 200) around the center of the mapped structure
scripts/extract_mafs_windows.sh 

# The quality filtering steps and the post-processing/quality filtering of the alignments were carried out using the Python notebook below.
# (Before running the notebook, do "source activate rnabound". If the environment has not been created yet, create it with "conda env create -f rnabound_env.yml".)
# The notebook has to be run from the scripts/ folder.
scripts/quality_control_mafs.ipynb

# Run the RNAbound and Dotu et al. approaches on the multiple sequence alignments and on single sequences.
# PLEASE NOTE that the following command requires the PETfold and RNAfold programs to be installed locally, and their paths have to be set in the script called below.
# Run scripts/run_petfold_rnabound.sh once per benchmark window, for each
# window size (100, 150, 200). Every non-header line of the window file is
# split on whitespace into the array `arr`; fields 1-3 and 8 (arr[0..2] and
# arr[7]) are passed on together with the window size, flank 10 and
# pnull 0.0005.
for win in 100 150 200; do
  # `read -r -a arr` fills the array that the command line below indexes.
  # Reading into a plain scalar would leave every ${arr[N]} empty.
  while read -r -a arr; do
    # Skip blank or truncated lines that cannot supply the 8th field.
    (( ${#arr[@]} >= 8 )) || continue
    scripts/run_petfold_rnabound.sh "${arr[0]}" "${arr[1]}" "${arr[2]}" "${arr[7]}" "$win" 10 0.0005
  done < <(grep -v window_start "dataset/windows/window_${win}_benchmark_data.bed")
done

# Get the top hit for the RNAbound and Dotu et al. approaches.
# The following scripts require the rnabound environment to be activated. If
# the environment has not been created yet, create it first with:
# $ conda env create -f scripts/environment.yml
source activate rnabound
# top hits for the multiple sequence alignments
python scripts/get_benchmarking_hits_multipleseq.py
# top hits for the single sequences
python scripts/get_benchmarking_hits_singleseq.py

# Parameter testing (combinations of different flank sizes and base pair
# thresholds)
python scripts/parameter_testing.py

# ==================================
# C) Prepare figures for the manuscript
# ==================================

# Figure 1,2 - comparison of RNAbound and Dotu et al., predictions for single and multiple sequence alignment
scripts/Figures1_2_multipleseq_and_singleseq_boxplot.ipynb

# Figure S1 - parameter combinations for RNAbound fitness function
scripts/FigureS1_parameter_testing.ipynb

# Figures 3-4: dot plots were generated using the "drawdot" function from PETfold together with the PETfold reliability matrix
# e.g. $ PETfold/bin/drawdot results/petfold_rnabound_window_200_10_0.0005/chr20_38429635_38429836_pp.txt >chr20_38429635_38429836_pp.ps
