setwd("~/giagkas/ASPEAK/IL4_v2/")
r1=read.table("IL4-q20.peaks.txt")
colnames(r1)=c("Chr",	"Strand",	"Interval","Interval_Length",
               "Tag_Count",	"Maximum_Height",	"Span_Size",	"Start",	"End",	"RPKM",	"Weighted_Center",	"p-Value",	"FDR")

library(dplyr)
library(tidyr)
colnames(df)
t=separate(r1,3, c( "TranscriptID", "GeneName", "Biotype"), "\\|")
df=cbind(r1, t$TranscriptID, t$GeneName, t$Biotype)
colnames(df)=c("Chr",	"Strand",	"Interval","Interval_Length",
               "Tag_Count",	"Maximum_Height",	"Span_Size",	"Start",	"End",	"RPKM",	"Weighted_Center",	"p-Value",	"FDR",
               "TranscriptID", "GeneName", "Biotype")


d <- within(df,   Coord<- paste(Chr,Start,End,  sep="_"))
d <- within(df,   customID<- paste(TranscriptID,Chr,Start,End,  sep="_"))
d1=d
I1=IRanges(d1$Start,d1$End)
R1=Rle(d1$Chr)
S1=Rle(d1$Strand)
meta1=subset(d1, select=c("GeneName", "TranscriptID","customID"))
gr1=GRanges(R1,I1,S1, meta1)


setwd("~/Documents/HuR_PARclip_withHeaders/reference_gtf/")
library(GenomicFeatures)
library(GenomicRanges)
library(rtracklayer)
GTFfile = "Mus_musculus.NCBIM37.64-toMM9.gtf"
txdb <- makeTxDbFromGFF(GTFfile, format="gtf") 

exon=exonsBy(txdb, use.names=T, by="tx")
intron=intronsByTranscript(txdb, use.names=T)
utr3=threeUTRsByTranscript(txdb, use.names=T)
utr5=fiveUTRsByTranscript(txdb, use.names=T)
cds=cdsBy(txdb, use.names=T, by="tx")


x=exon
tt=as.data.frame(findOverlaps(gr1, x))
ttq=as.data.frame(elementMetadata(gr1)$TranscriptID[tt$queryHits])
ttq=cbind(ttq,tt$queryHits)
tts=as.data.frame(names(x)[tt$subjectHits])     
tts=cbind(tts,tt$subjectHits)
tot=cbind(ttq,tts)
tot2=tot[(as.character(tot[,1])==as.character(tot[,3])),]
gr2=gr1[tot2[,2]]


dd1=as.data.frame(elementMetadata(gr2)$customID)
colnames(dd1)="customID"



x=intron
tt=as.data.frame(findOverlaps(gr1, x))
ttq=as.data.frame(elementMetadata(gr1)$TranscriptID[tt$queryHits])
ttq=cbind(ttq,tt$queryHits)
tts=as.data.frame(names(x)[tt$subjectHits])     
tts=cbind(tts,tt$subjectHits)
tot=cbind(ttq,tts)
tot2=tot[(as.character(tot[,1])==as.character(tot[,3])),]
gr2=gr1[tot2[,2]]


dd2=as.data.frame(elementMetadata(gr2)$customID)
colnames(dd2)="customID"



x=utr3
tt=as.data.frame(findOverlaps(gr1, x))
ttq=as.data.frame(elementMetadata(gr1)$TranscriptID[tt$queryHits])
ttq=cbind(ttq,tt$queryHits)
tts=as.data.frame(names(x)[tt$subjectHits])     
tts=cbind(tts,tt$subjectHits)
tot=cbind(ttq,tts)
tot2=tot[(as.character(tot[,1])==as.character(tot[,3])),]
gr2=gr1[tot2[,2]]


dd3=as.data.frame(elementMetadata(gr2)$customID)
colnames(dd3)="customID"



x=utr5
tt=as.data.frame(findOverlaps(gr1, x))
ttq=as.data.frame(elementMetadata(gr1)$TranscriptID[tt$queryHits])
ttq=cbind(ttq,tt$queryHits)
tts=as.data.frame(names(x)[tt$subjectHits])     
tts=cbind(tts,tt$subjectHits)
tot=cbind(ttq,tts)
tot2=tot[(as.character(tot[,1])==as.character(tot[,3])),]
gr2=gr1[tot2[,2]]


dd4=as.data.frame(elementMetadata(gr2)$customID)
colnames(dd4)="customID"


x=cds
tt=as.data.frame(findOverlaps(gr1, x))
ttq=as.data.frame(elementMetadata(gr1)$TranscriptID[tt$queryHits])
ttq=cbind(ttq,tt$queryHits)
tts=as.data.frame(names(x)[tt$subjectHits])     
tts=cbind(tts,tt$subjectHits)
tot=cbind(ttq,tts)
tot2=tot[(as.character(tot[,1])==as.character(tot[,3])),]
gr2=gr1[tot2[,2]]


dd5=as.data.frame(elementMetadata(gr2)$customID)
colnames(dd5)="customID"




dd=d1
dd$exon=d1$customID%in%dd1$customID
summary(dd$exon)
dd$intron=d1$customID%in%dd2$customID
summary(dd$intron)
dd$utr3=d1$customID%in%dd3$customID
summary(dd$utr3)
dd$utr5=d1$customID%in%dd4$customID
summary(dd$utr5)
dd$cds=d1$customID%in%dd5$customID
summary(dd$cds)

#library(qvalue)
#qv=qvalue(dc$p.Value, lambda = seq(0, 0.009980, 0.05))
#The expected range of p-values is (0-1], and the fact that you are missing a 
#whole quartile of expected values is throwing the pi0est() function for a loop.
#plot(qv)

setwd("~/giagkas/ASPEAK/IL4_v2/")
write.table(dd, file = "IL4-q20.peaks_annot_v3.txt", quote=F, row.names = F, sep = "\t" ) 
