#!/usr/bin/env bash

##########################################
# motif length parameters
##########################################
# Length of the oligo seeds.
# Allowable values: 5,6,7,8,9,10
len=7

# Minimum motif length.
# Alternative (disabled): derive it from the seed length.
#min_motif_length=$(( $len - 0 ))
min_motif_length=4

# Maximum motif length.
# Alternative (disabled): derive it from the seed length.
#max_motif_length=$(( $len + 10 ))
max_motif_length=10
##########################################
# motif length parameters
##########################################

##########################################
# clustering top seeds parameters
##########################################
# Motifs aligned during clustering must overlap by at least
# 'required_core_length'.
# cERMIT paper default for chip-seq data: 5
required_core_length=6

# Sequence similarity required to assign motifs to the same cluster
# (range: [0.0,1.0], identical <=> 1.0).
# cERMIT paper default for chip-seq data: 0.75
cluster_sim_threshold=0.85

# Motifs aligned during clustering must have a hypergeometric p-value for
# co-occurrence <= 'hypegeom_p_val_cutoff'.
# (The variable name is spelled "hypegeom"; it is kept as-is because the
# launch command further down refers to it by this name.)
# cERMIT paper default for chip-seq data: 1.0e-30
hypegeom_p_val_cutoff=1.0e-30
##########################################
# clustering top seeds parameters
##########################################

##########################################
# motif degeneracy parameters
##########################################
# Allowable 'degeneracy score'
# (range [0.0, 1.0]; 0.0 => no degeneracy, 1.0 => all 'N's).
# cERMIT paper default for chip-seq data: 0.75
degen_threshold=0.5

# Allowable percentage of degenerate bases in a motif.
# cERMIT paper default for chip-seq data: 0.75
fraction_degen_pos_threshold=0.5
##########################################
# motif degeneracy parameters
##########################################

##########################################
# p-value estimation parameters
##########################################
# Output file for the random scores.
random_runs_filename='random_scores'

# Output file for the p-value estimate based on the random scores.
significance_estimates_file='significance_estimates'

# Number of random scores to generate from randomly assigned evidence values.
# These feed the p-value estimate written to 'significance_estimates_file'.
# Set to 0 to skip p-value estimation.
num_random_runs=0
##########################################
# p-value estimation parameters
##########################################

##########################################
# regression estimation parameters
##########################################
# Scoring mode for the target set:
#   'yes' -> regression scoring of the target set
#   'no'  -> scoring of the target set as described in the cERMIT paper
use_regression_scoring='no'

# Input covariates file.
covariates_file_list='covariates_file_list'
##########################################
# regression estimation parameters
##########################################

##########################################
# bootstrap motif score parameters
##########################################
# Whether to apply a bootstrap filter when scoring a motif. The filter is
# based on resampling, with replacement, the set of target sequence regions.
bootstrap_motif_score='no'

# Number of bootstrap draws.
num_bootstrap_draws=20

# A motif is considered significant if
# (bootstrap_mean - bootstrap_Std_threshold*bootstrap_sd) > 0
bootstrap_Std_threshold=3.0
##########################################
# bootstrap motif score parameters
##########################################

##########################################
# PSSM parameters
##########################################
# Flanking positions in the PSSM are cropped when they have
# <= pssm_crop_threshold * (# reads of "core").
pssm_crop_threshold=0.33

# Cluster members are considered when their score is
# >= cluster_pssm_threshold_fraction * top_cluster_score.
cluster_pssm_threshold_fraction=0.5

# Fraction of the top sequences used to construct the PSSM.
fraction_of_top_sequences_to_consider=0.5
##########################################
# PSSM parameters
##########################################

##########################################
# target set size parameters
##########################################
#--------------------
# max target set size = max_gene_set_size_percentage_threshold * total_num_seq_regions
#--------------------
# Maximum target size (as a percentage of the total number of input sequence
# regions).
# cERMIT paper default for chip-seq data: 0.95
max_gene_set_size_percentage_threshold=0.99

#--------------------
# min target set size = MIN(min_gene_set_size_absolute_threshold, min_gene_set_size_percentage_threshold * total_num_seq_regions)
#--------------------
# Minimum geneset size (as an absolute number).
# cERMIT paper defaults: 20 for chip-chip, 100 for chip-seq
min_gene_set_size_absolute_threshold=20

# Minimum geneset size (as a percentage of the total # of input sequence
# regions).
# cERMIT paper default for chip-seq data: 0.01
min_gene_set_size_percentage_threshold=0.05
##########################################
# target set size parameters
##########################################


##########################################
# dataset-specific parameters
##########################################
# Strand handling:
#   'yes' -> treat the reverse complement as a different motif
#            (appropriate for RNA-binding analyses)
#   'no'  -> treat the reverse complement as the same motif
#            (appropriate for DNA-binding analyses)
strand_specific_analysis='yes'

# Supported options: 'human', 'yeast', 'arabidopsis'.
# Any other value will result in uniform background frequencies.
species='human'
##########################################
# dataset-specific parameters
##########################################

##########################################
# I/O & run parameters
##########################################
# Output directory; its name embeds the seed length 'len'.
out_dir="out${len}mers"

# File path for the starting oligo-mer seeds (allows for degenerate motifs
# over the IUPAC alphabet).
# Default: non-degenerate 5,6,7,8,9,10-mers.
# NOTE(review): the original note says this file is assumed to be in the
# parent of the currently active directory, but the path below starts with
# './' (the current directory) -- confirm which is intended.
in_seeds_file="./oligos_size_${len}"

# Binary executable.
# NOTE(review): the original note says this is assumed to be in the parent of
# the currently active directory, but the path below starts with './' (the
# current directory) -- confirm which is intended.
executable='./cERMIT'

# File containing information on the input binding evidence & known PSSM, if
# available.
# NB: the binding evidence is expected to be ordered from LARGE to SMALL
# values.
evidence_file_list='evidence_file_list'

# File containing information on the input sequence.
sequence_file_list='sequence_file_list'

# Number of top clusters for which the motif target set (match occurrences)
# is output.
num_clusters_to_output_motif_occurrences_for=5

# Number of top motif cluster PSSMs to output.
num_motif_clusters_to_output=5

# Fast mode option:
#   yes -> evolve only "promising" motif seeds (that have positive scores)
#   no  -> evolve all motif seeds (slower option)
fast_mode='no'

##########################################
# I/O & run parameters
##########################################

##########################################
# DO NOT MODIFY BEYOND THIS POINT !!!
##########################################
echo "output directory: $out_dir"

# Fail early with a clear message if the binary cannot be resolved, instead
# of relying on the shell's generic error at launch time.
if ! command -v "$executable" >/dev/null 2>&1; then
  printf 'error: executable not found or not executable: %s\n' "$executable" >&2
  exit 127
fi

# Arguments are passed positionally, so their ORDER IS SIGNIFICANT: it is the
# same order as the original single-line invocation. Do not reorder.
# Each value is quoted so that an empty or whitespace-containing setting stays
# a single argument instead of silently shifting every argument after it.
cermit_args=(
  "$evidence_file_list"
  "$sequence_file_list"
  "$out_dir"
  "$in_seeds_file"
  "$min_motif_length"
  "$max_motif_length"
  "$required_core_length"
  "$pssm_crop_threshold"
  "$cluster_pssm_threshold_fraction"
  "$cluster_sim_threshold"
  "$fraction_of_top_sequences_to_consider"
  "$hypegeom_p_val_cutoff"
  "$num_motif_clusters_to_output"
  "$degen_threshold"
  "$fraction_degen_pos_threshold"
  "$max_gene_set_size_percentage_threshold"
  "$min_gene_set_size_absolute_threshold"
  "$min_gene_set_size_percentage_threshold"
  "$num_clusters_to_output_motif_occurrences_for"
  "$species"
  "$random_runs_filename"
  "$num_random_runs"
  "$significance_estimates_file"
  "$strand_specific_analysis"
  "$bootstrap_Std_threshold"
  "$bootstrap_motif_score"
  "$num_bootstrap_draws"
  "$use_regression_scoring"
  "$covariates_file_list"
  "$fast_mode"
)

# Last command of the script: its exit status becomes the script's status.
"$executable" "${cermit_args[@]}"
