#!/bin/bash
################################
#
# script extract_mism: starting from a .pileup file created with samtools mpileup it extracts mismatch information;
#                   it creates 12 files corresponding to the possible mismatches on two strands
#
# Written by Monica Golumbeanu
# monica.golumbeanu@bsse.ethz.ch
#################################

BEGIN {
print "#Chr" "\t" "Position" "\t" "X" "\t" "Y" > (path"TC_f.txt")
print "#chr" "\t" "0" "\t" "0" "\t" "0" >> (path"TC_f.txt")

print "#Chr" "\t" "Position" "\t" "X" "\t" "Y" > (path"AC_f.txt")
print "#chr" "\t" "0" "\t" "0" "\t" "0" >> (path"AC_f.txt")

print "#Chr" "\t" "Position" "\t" "X" "\t" "Y" > (path"GC_f.txt")
print "#chr" "\t" "0" "\t" "0" "\t" "0" >> (path"GC_f.txt")

print "#Chr" "\t" "Position" "\t" "X" "\t" "Y" > (path"AG_r.txt")
print "#chr" "\t" "0" "\t" "0" "\t" "0" >> (path"AG_r.txt")

print "#Chr" "\t" "Position" "\t" "X" "\t" "Y" > (path"TG_r.txt")
print "#chr" "\t" "0" "\t" "0" "\t" "0" >> (path"TG_r.txt")

print "#Chr" "\t" "Position" "\t" "X" "\t" "Y" > (path"CG_r.txt")
print "#chr" "\t" "0" "\t" "0" "\t" "0" >> (path"CG_r.txt")

print "Output mismatch summary files opened in " path
}

{
######## EXTRACT THE READ SEQUENCE #######
read_seq=$5
# replace the end of read marker
gsub(/\$/,"",read_seq);
# replace the start of read marker
gsub(/\^./,"",read_seq);

# remove the insertion markers
while (index(read_seq,"+")>0) {
    pos = index(read_seq,"+")
    first_part = substr(read_seq, 1, pos-1)
    second_part = substr(read_seq, pos+1)
    nb = ""
    # extract the number of inserted nucleotides
    while (substr(second_part,1,1) ~ /^[0-9]+$/) {
        nb = nb "" substr(second_part,1,1)
        second_part = substr(second_part,2)
    }
    # remove the insterted nucleotides
    read_seq = first_part "" substr(second_part,nb+1)
}

# remove the deletion markers
while (index(read_seq,"-")>0) {
    pos = index(read_seq,"-")
    first_part = substr(read_seq, 1, pos-1)
    second_part = substr(read_seq, pos+1)
    nb = ""
    # extract the number of deleted characters
    while (substr(second_part,1,1) ~ /^[0-9]+$/) {
        nb = nb "" substr(second_part,1,1)
        second_part = substr(second_part,2)
    }
    # remove the deleted nucleotides
    read_seq = first_part "" substr(second_part,nb+1)
}

####### EXTRACT THE MISMATCH PROFILE ######
matches_f = gsub("\\.","-",read_seq)
matches_r = gsub(",",",",read_seq)

nb_A = gsub("A","A",read_seq)
nb_C = gsub("C","C",read_seq)
nb_G = gsub("G","G",read_seq)
nb_T = gsub("T","T",read_seq)
nb_N = gsub("N","N",read_seq)
nb_a = gsub("a","a",read_seq)
nb_c = gsub("c","c",read_seq)
nb_g = gsub("g","g",read_seq)
nb_t = gsub("t","t",read_seq)
nb_n = gsub("n","n",read_seq)

cov_f = matches_f + nb_A + nb_C + nb_G + nb_T + nb_N
cov_r = matches_r + nb_a + nb_c + nb_g + nb_t + nb_n

####### PRINT TO THE FILES #######
# for the forward strand
if (cov_f>=cov_min) {
    if ($3 == "A" || $3 == "a") {
        print $1 "\t" $2 "\t" nb_C "\t" cov_f >> (path"AC_f.txt")
    }
    else if ($3 == "G" || $3 == "g") {
        print $1 "\t" $2 "\t" nb_C "\t" cov_f >> (path"GC_f.txt")
    }
    else if ($3 == "T" || $3 == "t") {
        print $1 "\t" $2 "\t" nb_C "\t" cov_f >> (path"TC_f.txt")
    }
}

# for the reverse strand
if (cov_r>=cov_min) {
    if ($3 == "A" || $3 == "a") {
        print $1 "\t" $2 "\t" nb_g "\t" cov_r >> (path"AG_r.txt")
    }
    else if ($3 == "C" || $3 == "c") {
        print $1 "\t" $2 "\t" nb_g "\t" cov_r >> (path"CG_r.txt")
    }
    else if ($3 == "T" || $3 == "t") {
        print $1 "\t" $2 "\t" nb_g "\t" cov_r >> (path"TG_r.txt")
    }
}

}
END {
# close all the files
close(path"AC_f.txt")
close(path"GC_f.txt")
close(path"TC_f.txt")
close(path"AG_r.txt")
close(path"CG_r.txt")
close(path"TG_r.txt")
}