#!/bin/bash

#    SeqTrimMap version 1.0, 01-06-2012
#    Copyright 2012, Antonio Marco
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, version 3 of the License.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    A copy of the GNU General Public License in included along with
#    this program.



print_header(){
	echo ''
	echo ' =================================================================== '
	echo ' =                          SeqTrimMap 1.0                         = '
	echo ' =             Sequential trimming and mapping pipeline            = '
	echo ' =                    Antonio Marco (c) 2011, 2012                 = '
	echo ' =-----------------------------------------------------------------= '
	echo ' = Suggested citation:                                             = '
	echo ' =  Sequence reads were mapped with Bowtie (Langmead et al. 2009), = '
	echo ' =   using the sequential trimming strategy described in           = '
	echo ' =   Marco and Griffiths-Jones (2012)                              = '
	echo ' =-----------------------------------------------------------------= '
	echo ' = References:                                                     = '
	echo ' =   - Langmead (2009) Genome Biology 10:R25                       = '
	echo ' =   - Marco and Griffiths-Jones (2012) Bioinformatics 28:318      = '
	echo ' =================================================================== '
	echo ' '
}

# PATHS
#BOWTIE_PATH=/data/results/tools/align/bowtie-0.12.9/bowtie
#BOWTIE_PATH=/data/results/tools/align/bowtie-0.12.9
BOWTIE_PATH=/data/results/tools/align/bowtie-1.2.1.1
#TEMP_PATH=/data/results/fleming/run4/uniq_all/FinalPARCLIP/pvalues/mRNA/0hrep1/tmp
#TEMP_PATH=/data/images/proton/DKlab/mr/Polysomes/reads/seqtrimmap/tmp
TEMP_PATH=/tmp2/pc2
#TEMP_PATH=.

# READ ARGUMENTS
args=`getopt zbiskScp:m:v:o:f:l:x:u:t: $*`

# PRINT HELP
if [ $# -eq 0 ] ; then
	clear
	print_header
	echo 'USAGE: ./SeqTrimMap [OPTIONS] <INPUT_FASTA> <GENOME_FASTA>'
	echo
	echo 'Options:'
	echo '  -z		Gzip output files'
	echo '  -b		Input file in base-space'
	echo '  -S		Output file in SAM format'
	echo '  -c		Output sequence in color-space'
	echo '  -i		Reference genome is already in bowtie index format'
	echo '  -k		Use ''--col-keepends'' native Bowtie option'
	echo '  -p INT	Number of processors'
	echo '  -m INT	Save appart reads mapping to more than INT positions'
	echo '  -v INT	Mismatches allowed'
	echo '  -o <FILENAME>	Base name for output files'
	echo '  -f <FILENAME>	Filter reads mapping to the specified file (FASTA)'
	echo '  -l INT	Minimum read length after trimming to be mapped'
	echo '  -x INT	Maximum read length before trimming'
	echo '  -s		Create output file with reads mapped from length -u to -t'
	echo '    -u INT	(see -s option)'
	echo '    -t INT	(see -s option)'
	echo
	exit 1 
fi

# DEFAULT VALUES
zflag="no" # Gzip output files
bflag="no" # Input file in base-space
Sflag="no" # Output file in SAM format
cflag="no" # Output sequence in color-space
iflag="no" # Reference genome is already in bowtie index format
kflag="no" # Use ''--col-keepends'' native Bowtie option'
pflag=1 # Number of processors
mflag=-1 # Save appart reads mapping to more than 'mflag' positions (-1 means 'option disabled')
vflag=3 # Mismatches allowed
oflag="mapped_reads" # Base name for output files
fflag=0 # Filter reads mapping to the specified file (FASTA)
lflag=20 # Minimum read length after trimming to be mapped
xflag="no" # Maximum read length before trimming
sflag="no" # It creates an extra output file with reads mapped from length 'uflag' to 'tflag'
uflag=20 # See sflag
tflag=26 # See sflag

# SET PARAMETERS
set -- $args
for i
do
	case "$i" in
		-z) shift;zflag="yes";;
		-b) shift;bflag="yes";;
		-S) shift;Sflag="yes";;
		-c) shift;cflag="yes";;
		-i) shift;iflag="yes";;
		-k) shift;kflag="yes";;
		-s) shift;sflag="yes";;
		-p) shift;pflag="$1";shift;;
		-m) shift;mflag="$1";shift;;
		-v) shift;vflag="$1";shift;;
		-o) shift;oflag="$1";shift;;
		-f) shift;fflag="$1";shift;;
		-l) shift;lflag="$1";shift;;
		-x) shift;xflag="$1";shift;;
		-u) shift;uflag="$1";shift;;
		-t) shift;tflag="$1";shift;;
	esac
done

clear
print_header
echo '* Preparing INPUT files'
ls -1 ${oflag}.tar.gz > /dev/null 2>&1; if [ "$?" = "0" ]; then rm ${oflag}.tar.gz; fi

#COLOR SPACE (default mode)
if [[ $bflag == "no" ]]; then

	# Remove comment and blank lines
	cat ${2} | sed '/^$/d' | grep -v -e'^#'> $TEMP_PATH/$$_tempFile

	# Remove first base from color-space reads (if no COL_KEEPENDS)
	if [[ $kflag != "yes" ]]; then
		cat $TEMP_PATH/$$_tempFile | awk '{if (substr($1,1,1) != ">") print substr($1,2,length $1); else print $0;}' > $TEMP_PATH/$$_temp_unmapped.-1
		rm $TEMP_PATH/$$_tempFile
	else
		mv $TEMP_PATH/$$_tempFile $TEMP_PATH/$$_temp_unmapped.-1
	fi

	# Building indexes if requested
	if [[ $iflag != "yes" ]]; then
		echo '* Building REFERENCE indexes (using bowtie-build)'
		$BOWTIE_PATH/bowtie-build --color ${3} ${3} >/dev/null
	fi

	# Building indexes of filters
	if [[ $fflag != 0 ]]; then
		echo '* Building FILTER indexes (using bowtie-build)'
		$BOWTIE_PATH/bowtie-build --color $fflag $fflag >/dev/null
	fi

# BASE SPACE
else
	# Remove comment and blank lines
	cat ${2} | sed '/^$/d' | grep -v -e'^#'> $TEMP_PATH/$$_temp_unmapped.-1

	# Building indexes if requested
	if [[ $iflag != "yes" ]]; then
		echo '* Building REFERENCE indexes (using bowtie-build)'
		$BOWTIE_PATH/bowtie-build ${3} ${3} >/dev/null
	fi

	# Building indexes of filters
	if [[ $fflag != 0 ]]; then
		echo '* Building FILTER indexes (using bowtie-build)'
		$BOWTIE_PATH/bowtie-build $fflag $fflag >/dev/null
	fi
fi


# MAPPING STEP
echo '* Mapping reads to REFERENCE genome (using bowtie)'

# Set COLOR parameter
if [[ $bflag == "yes" ]]; then
	colorflag=''
else
	colorflag='-C'
	# Set col-keepends flag
	if [[ $kflag == "yes" ]]; then
		colkflag='--col-keepends'
	else
		colkflag=''
	fi
fi

# Set OUTPUT format
if [[ $Sflag == "yes" ]]; then
	samflag='--sam'
else
	samflag=''
fi

# Set OUTPUT sequence
if [[ $cflag == "yes" ]]; then
	coloutflag='--col-cseq'
else
	coloutflag=''
fi


# Set TRIM limit
read_length=`head -2 $TEMP_PATH/$$_temp_unmapped.-1 | tail -1 | awk '{print length}'`
let trim_limit=$read_length-$lflag

if [[ $xflag == "no" ]]; then
	first_trim=$read_length #Default: read length
else
	first_trim=$xflag #User defined
	# MODIFY FIRST UNMAPPED FILE NAME
	let unmapped_file_number=$read_length-$first_trim-1
	mv $TEMP_PATH/$$_temp_unmapped.-1 $TEMP_PATH/$$_temp_unmapped.${unmapped_file_number}
fi

# Remove any previous STATs files
ls -1 ${oflag}.bowtie_stats > /dev/null 2>&1; if [ "$?" = "0" ]; then rm ${oflag}.bowtie_stats; fi

let trim_init=$read_length-$first_trim


for ((i=$trim_init; i<=$trim_limit; i++))
do
	let actual_length=$read_length-$i
	let "i_ant = $i - 1"

	# FILTERING
	if [[ $fflag != 0 ]]; then
		echo >>${oflag}.bowtie_stats
		echo "Filtering reads at  length $actual_length" >>${oflag}.bowtie_stats
		echo -en "\r   - filtering reads at length $actual_length"
echo $BOWTIE_PATH/bowtie $samflag $coloutflag $colorflag $colkflag -a --best -v $vflag -3 $i --un $TEMP_PATH/$$_filtered.in -p $pflag $fflag -f $TEMP_PATH/$$_temp_unmapped.$i_ant > $TEMP_PATH/$$_filtered.out.$i 2>>${oflag}.bowtie_stats
#		$BOWTIE_PATH/bowtie $samflag $coloutflag $colorflag $colkflag -a --best -v $vflag -3 $i --un $TEMP_PATH/$$_filtered.in -p $pflag $fflag -q $TEMP_PATH/$$_temp_unmapped.$i_ant > $TEMP_PATH/$$_filtered.out.$i 2>>${oflag}.bowtie_stats
#/data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_5/README.txt 
#Recommended BOWTIE parameters:
		$BOWTIE_PATH/bowtie $samflag $coloutflag $colorflag $colkflag -a -m 10 --best --strata -v $vflag -3 $i --un $TEMP_PATH/$$_filtered.in -p $pflag $fflag -f $TEMP_PATH/$$_temp_unmapped.$i_ant > $TEMP_PATH/$$_filtered.out.$i 2>>${oflag}.bowtie_stats
		mv $TEMP_PATH/$$_filtered.in $TEMP_PATH/$$_temp_unmapped.$i_ant
	fi

	# MAPPING
	echo -en "\r   -   mapping reads at length $actual_length"
	if [[ $mflag == -1 ]]; then
		multiple_mapping_flag=''
	else
		multiple_mapping_flag=" -m $mflag --max $TEMP_PATH/$$_many_maps.$i "
	fi

	echo >>${oflag}.bowtie_stats
	echo "Mapping reads at  length $actual_length" >>${oflag}.bowtie_stats
	echo 	$BOWTIE_PATH/bowtie $samflag $coloutflag $colorflag $colkflag -a --best -v $vflag -3 $i --un $TEMP_PATH/$$_temp_unmapped.$i $multiple_mapping_flag -f $pflag ${3} -f $TEMP_PATH/$$_temp_unmapped.$i_ant 
#	$BOWTIE_PATH/bowtie $samflag $coloutflag $colorflag $colkflag -a --best -v $vflag -3 $i --un $TEMP_PATH/$$_temp_unmapped.$i $multiple_mapping_flag -p $pflag ${3} -q $TEMP_PATH/$$_temp_unmapped.$i_ant > $TEMP_PATH/$$_reads_mapped.$i 2>>${oflag}.bowtie_stats
#/data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_5/README.txt 
#Recommended BOWTIE parameters:
	$BOWTIE_PATH/bowtie $samflag $coloutflag $colorflag $colkflag -a --best --strata -v $vflag -3 $i --un $TEMP_PATH/$$_temp_unmapped.$i $multiple_mapping_flag -p $pflag ${3} -f $TEMP_PATH/$$_temp_unmapped.$i_ant > $TEMP_PATH/$$_reads_mapped.$i 2>>${oflag}.bowtie_stats	
	rm $TEMP_PATH/$$_temp_unmapped.$i_ant

done
echo -en "\r   - Bowtie mapping statistics saved in file: ${oflag}.bowtie_stats"

echo
echo '* Formatting OUTPUT files'


# FORMATTING OUTPUT FILES
if [[ $Sflag == "yes" ]]; then # PREPARE SAM HEADER
	echo "@HD	VN:1.0	SO:unsorted" > $TEMP_PATH/$$_samheader
	cat $TEMP_PATH/$$_reads_mapped.*  | awk '{if (substr($1,1,3) == "@SQ") print $0;}' | sort | uniq >> $TEMP_PATH/$$_samheader
	echo "@PG	ID:SeqTrimMap/Bowtie	CL:'SeqTrimMap $args'" >> $TEMP_PATH/$$_samheader
fi
if [[ $sflag == "yes" ]]; then
	ls -1 ${oflag}.sRNA.bowtie > /dev/null 2>&1; if [ "$?" = "0" ]; then rm ${oflag}.sRNA.bowtie; fi # remove any previous sRNA file
	ls -1 ${oflag}.sRNA.sam > /dev/null 2>&1; if [ "$?" = "0" ]; then rm ${oflag}.sRNA.sam; fi # remove any previous sRNA file
	let sRNA_from=$read_length-$tflag
	let sRNA_to=$read_length-$uflag
	if [[ $Sflag == "yes" ]]; then # SAM headers
		cat $TEMP_PATH/$$_samheader > ${oflag}.sRNA.sam
	fi
	for ((i=sRNA_from; i<=sRNA_to; i++))
	do
		if [[ $Sflag == "yes" ]]; then
			cat $TEMP_PATH/$$_reads_mapped.$i | awk '{if (substr($1,1,1) != "@") print $0;}' >> ${oflag}.sRNA.sam
		else
			cat $TEMP_PATH/$$_reads_mapped.$i >> ${oflag}.sRNA.bowtie
		fi
	done
fi
rm $TEMP_PATH/$$_temp_unmapped.*
ls -1 $TEMP_PATH/$$_filtered.out.* > /dev/null 2>&1; if [ "$?" = "0" ]; then cat $TEMP_PATH/$$_filtered.out.* > ${oflag}.filteredout; rm $TEMP_PATH/$$_filtered.out.*; fi
ls -1 $TEMP_PATH/$$_${fflag}.* > /dev/null 2>&1; if [ "$?" = "0" ]; then rm $TEMP_PATH/$$_${fflag}.*; fi
ls -1 $TEMP_PATH/$$_many_maps.* > /dev/null 2>&1; if [ "$?" = "0" ]; then cat $TEMP_PATH/$$_many_maps.* > ${oflag}.manymaps; rm $TEMP_PATH/$$_many_maps.*; fi
if [[ $Sflag == "yes" ]]; then
	cat $TEMP_PATH/$$_samheader > ${oflag}.sam
	cat $TEMP_PATH/$$_reads_mapped.* | awk '{if (substr($1,1,1) != "@") print $0;}' >> ${oflag}.sam
	rm $TEMP_PATH/$$_samheader
else
	cat $TEMP_PATH/$$_reads_mapped.* > ${oflag}.bowtie
fi
rm $TEMP_PATH/$$_reads_mapped.*
total_reads_mapped=`cat ${oflag}.bowtie_stats | grep -e '^# reads with at least one reported alignment:' -e '# reads with alignments suppressed due to -m: '| awk -F' ' '{sum += $9}END{print sum}'`
total_reads_processed=`grep -e'^>' ${2} | wc -l`
percent_mapped=$(echo "scale=2; 100*$total_reads_mapped/$total_reads_processed" | bc)
if [[ $zflag == "yes" ]]; then
	tar -cf $TEMP_PATH/$$_temp_${oflag}.tar ${oflag}*
	gzip -9 $TEMP_PATH/$$_temp_${oflag}.tar
	rm ${oflag}*
	mv $TEMP_PATH/$$_temp_${oflag}.tar.gz ${oflag}.tar.gz
fi

# END
echo
echo "Total reads mapped: $total_reads_mapped, out of $total_reads_processed ($percent_mapped%)"
echo

#EXIT
exit 0
