#!/bin/bash

# ******REQUIRES SAMTOOLS******

#This is a sam assembly collapser using coordinates and strand as the "unique insert" identifiers, keeping the read with the highest mapping quality
#Written/assembled by Jake Enk and Alison Devault (and named by Nathalie Mouttham) June 3, 2013

# Usage: $ aweSAM_collapser [input samfile] [output collapsed samfile]

# PROCESS:

#1. Start with sam file as first variable, output file as second variable
inputsam=$1
collsam=$2

#2. Reduce the samfile to mapped reads only
awk '$3 != "*"' $inputsam |

#3. Convert mapped sam to bed using sam2bed from BEDOPS script package (by Eric Haugen and Alex Reynolds, version 2.2 (http://code.google.com/p/bedops/))
samtools view -S -\
        | gawk -F "\t" '{ \
            if ( ! and(4, $2) ) { \
                sval = $6; \
                gsub("[0-9]+[ISHP]", "", sval ); \
                gsub("[MDN=X]", "|", sval); \
                num = split(sval, a, "|"); \
                lng = 0; \
                for ( i = 1; i <= num; ++i ) { \
                    lng += int(a[i]); \
                } \
                printf "%s\t%s\t%d\t%s\t%s", $3, int($4) - 1, int($4) - 1 + lng, $1, $2; \
                if ( and(16, $2) ) { printf "\t-"; } else { printf "\t+"; } \
                for ( i = 5; i <= NF; ++i ) { \
                    printf "\t%s", $(i); \
                } \
                printf "\n"; \
            } \
        }' | 

#4. Sort so that the -u unique collapse function will keep the read with the highest mapping quality
#    NOTE: -k2,2n sorts by 5' position, -k3,3n by 3' position, and -k6,6 by strand (+ or -); include or exclude these as desired

sort -k 7,7nr - | sort -k1,1 -k2,2n -k3,3n -k6,6 -u |

#5. Cut out names of kept sequences; find and extract these from the original sam file, writing to a temporary file

cut -f 4 - | fgrep -f - $inputsam > temporary.file;

#6. Append original sam header to make a functioning collapsed sam
grep "^@" $inputsam | cat - temporary.file > $collsam

#7. Cleanup
rm temporary.file; 

#8. Calculate collapse rate (updated 2016-June-10, Jake Enk)
original=$(awk '$3 != "*" {print $1}' $inputsam | sort | uniq | wc -l); collapsed=$(awk '$3 != "*" {print $1}' $collsam | sort | uniq | wc -l); duplicates=$(echo $original-$collapsed | bc); dupprop=$(echo "scale=4; $duplicates/$original" | bc); duprate=$(echo "scale=2; $dupprop*100/1" | bc)

#9. Report
echo $1" collapsed to "$2"; "$duprate"% were duplicates"
