#!/bin/bash
######################
# script extract_sites - given a list of significant T-Cs, extract the
#                        binding sites
# input: intersect_file - .bed file sorted by the mismatch position and read start
#        output_file - file where the read groups are to be saved
#        refine_cov - minimum coverage at the tails of a group (tails below this coverage will be trimmed)
#
# output: file containing 4 columns: chromosome, start, end, strand
#
# Written by Monica Golumbeanu
# monica.golumbeanu@bsse.ethz.ch
#####################

BEGIN{
chr="NULL"
prev_e=0
t=0
}

{
# while on the same chromosome
if ($1==chr){
    # if the new read overlaps with the previous one
    if ($7<prev_e) {
        t = t + 1
        read_s[t] = $7
        read_e[t] = $8
        prev_e = $8
    } else {
        if (t > refine_cov) {
            # write the binding site to the file
            for (i=0; i<=refine_cov; i++) {
                tail[i+1] = read_e[t-i]
            }
            n = asort(tail)
            # chrom start end strand
            if (read_s[refine_cov+1]+1 < tail[1]+1 ) {
                print chr "\t" read_s[refine_cov+1]+1 "\t" tail[1]+1 "\t" 0 "\t" 0 "\t" $6 > output_file
            }
        }
        delete read_e
        delete read_s
        delete tail
        t = 1
        read_s[t] = $7
        read_e[t] = $8
        prev_e = $8
    }
# if different chromosome
} else {
    # if the buffer is still full from the previous chromosme
    if (t > refine_cov) {
        for (i=0; i<=refine_cov; i++) {
            tail[i+1] = read_e[t-i]
        }
        n = asort(tail)
        # chrom start end strand
        if (read_s[refine_cov+1]+1 < tail[1]+1 ) {
            print chr "\t" read_s[refine_cov+1]+1 "\t" tail[1]+1 "\t" 0 "\t" 0 "\t" $6 > output_file
        }
    }
    chr=$1
    prev_e = $8
    delete read_e
    delete read_s
    delete tail
    t=1
    read_s[t] = $7
    read_e[t] = $8
}

}

END {
# if the buffer is still full
if (t > refine_cov) {
    # write the group to the file
    for (i=0; i<=refine_cov; i++) {
        tail[i+1] = read_e[t-i]
    }
    n = asort(tail)
    # chrom start end strand
    if (read_s[refine_cov+1]+1 < tail[1]+1 ) {
        print chr "\t" read_s[refine_cov+1]+1 "\t" tail[1]+1 "\t" 0 "\t" 0 "\t" $6 > output_file
    }
    delete read_e
    delete tail
    prev_e = 0
    t = 0
}
close(output_file)
}