#!/bin/bash ###################### # script extract_sites - given a list of significant T-Cs, extract the # binding sites # input: intersect_file - .bed file sorted by the mismatch position and read start # output_file - file where the read groups are to be saved # refine_cov - minimum coverage at the tails of a group (tails below this coverage will be trimmed) # # output: file containing 4 columns: chromosome, start, end, strand # # Written by Monica Golumbeanu # monica.golumbeanu@bsse.ethz.ch ##################### BEGIN{ chr="NULL" prev_e=0 t=0 } { # while on the same chromosome if ($1==chr){ # if the new read overlaps with the previous one if ($7 refine_cov) { # write the binding site to the file for (i=0; i<=refine_cov; i++) { tail[i+1] = read_e[t-i] } n = asort(tail) # chrom start end strand if (read_s[refine_cov+1]+1 < tail[1]+1 ) { print chr "\t" read_s[refine_cov+1]+1 "\t" tail[1]+1 "\t" 0 "\t" 0 "\t" $6 > output_file } } delete read_e delete read_s delete tail t = 1 read_s[t] = $7 read_e[t] = $8 prev_e = $8 } # if different chromosome } else { # if the buffer is still full from the previous chromosme if (t > refine_cov) { for (i=0; i<=refine_cov; i++) { tail[i+1] = read_e[t-i] } n = asort(tail) # chrom start end strand if (read_s[refine_cov+1]+1 < tail[1]+1 ) { print chr "\t" read_s[refine_cov+1]+1 "\t" tail[1]+1 "\t" 0 "\t" 0 "\t" $6 > output_file } } chr=$1 prev_e = $8 delete read_e delete read_s delete tail t=1 read_s[t] = $7 read_e[t] = $8 } } END { # if the buffer is still full if (t > refine_cov) { # write the group to the file for (i=0; i<=refine_cov; i++) { tail[i+1] = read_e[t-i] } n = asort(tail) # chrom start end strand if (read_s[refine_cov+1]+1 < tail[1]+1 ) { print chr "\t" read_s[refine_cov+1]+1 "\t" tail[1]+1 "\t" 0 "\t" 0 "\t" $6 > output_file } delete read_e delete tail prev_e = 0 t = 0 } close(output_file) }