setwd("~/giagkas/ASPEAK/IL4_v2/")
r1=read.table("IL4-q20.peaks.txt")
colnames(r1)=c("Chr",	"Strand",	"Interval","Interval_Length",
               "Tag_Count",	"Maximum_Height",	"Span_Size",	"Start",	"End",	"RPKM",	"Weighted_Center",	"p-Value",	"FDR")



library(dplyr)
library(tidyr)
colnames(df)
t=separate(r1,3, c( "TranscriptID", "GeneName", "Biotype"), "\\|")
df=cbind(r1, t$TranscriptID, t$GeneName, t$Biotype)
colnames(df)=c("Chr",	"Strand",	"Interval","Interval_Length",
               "Tag_Count",	"Maximum_Height",	"Span_Size",	"Start",	"End",	"RPKM",	"Weighted_Center",	"p-Value",	"FDR",
               "TranscriptID", "GeneName", "Biotype")


d <- within(df,   Coord<- paste(Chr,Start,End,  sep="_"))
d <- within(df,   customID<- paste(TranscriptID,Chr,Start,End,  sep="_"))
#check duplicates coordinates
n_occur <- data.frame(table(d$Coord))
n_occur[n_occur$Freq> 1,]
#check duplicated custom peak ID
n_occur <- data.frame(table(d$customID))
n_occur[n_occur$Freq> 1,]

d1=d

##### exon
setwd("~/giagkas/ASPEAK/IL4_v2/")
r1=read.table("exon.reg.peaks.txt")
colnames(r1)=c("Chr",	"Strand",	"Interval","Interval_Length",
               "Tag_Count",	"Maximum_Height",	"Span_Size",	"Start",	"End",	"RPKM",	"Weighted_Center",	"p-Value",	"FDR")



library(dplyr)
library(tidyr)
colnames(df)
t=separate(r1,3, c( "TranscriptID", "GeneName", "Biotype"), "\\|")
df=cbind(r1, t$TranscriptID, t$GeneName, t$Biotype)
colnames(df)=c("Chr",	"Strand",	"Interval","Interval_Length",
               "Tag_Count",	"Maximum_Height",	"Span_Size",	"Start",	"End",	"RPKM",	"Weighted_Center",	"p-Value",	"FDR",
               "TranscriptID", "GeneName", "Biotype")


d <- within(df,   Coord<- paste(Chr,Start,End,  sep="_"))
d <- within(df,   customID<- paste(TranscriptID,Chr,Start,End,  sep="_"))
n_occur <- data.frame(table(d$Coord))
n_occur[n_occur$Freq> 1,]

n_occur <- data.frame(table(d$customID))
n_occur[n_occur$Freq> 1,]

d2=d







##### intron
setwd("~/giagkas/ASPEAK/IL4_v2/")
r1=read.table("intron.reg.peaks.txt")
colnames(r1)=c("Chr",	"Strand",	"Interval","Interval_Length",
               "Tag_Count",	"Maximum_Height",	"Span_Size",	"Start",	"End",	"RPKM",	"Weighted_Center",	"p-Value",	"FDR")



library(dplyr)
library(tidyr)
colnames(df)
t=separate(r1,3, c( "TranscriptID", "GeneName", "Biotype"), "\\|")
df=cbind(r1, t$TranscriptID, t$GeneName, t$Biotype)
colnames(df)=c("Chr",	"Strand",	"Interval","Interval_Length",
               "Tag_Count",	"Maximum_Height",	"Span_Size",	"Start",	"End",	"RPKM",	"Weighted_Center",	"p-Value",	"FDR",
               "TranscriptID", "GeneName", "Biotype")


d <- within(df,   Coord<- paste(Chr,Start,End,  sep="_"))
d <- within(df,   customID<- paste(TranscriptID,Chr,Start,End,  sep="_"))
n_occur <- data.frame(table(d$Coord))
n_occur[n_occur$Freq> 1,]

n_occur <- data.frame(table(d$customID))
n_occur[n_occur$Freq> 1,]

d3=d

#combine


d1$exon=d1$customID%in%d2$customID
summary(d1$exon)
d1$intron=d1$customID%in%d3$customID
summary(d1$intron)

test=subset(d1, (d1$exon=="FALSE")&(d1$intron=="FALSE"))


write.table(d1, file="IL4_ASpeak_q20_locationAnnot.txt", quote = F, row.names = F, sep = "\t")

colnames(d1)
summary(d1$Span_Size)
dn=density(d1$Span_Size)
plot(dn, main="span Size - IL4 peaks", xlab="Span Size")
hist(d1$Tag_Count, main="Tag counts - IL4 peaks",breaks=100000, xlab="Tag counts", xlim=c(0,100))
hist(d1$Maximum_Height, main="Maxim. height- IL4 peaks",breaks=100000, xlab="Max height", xlim=c(0,100))
summary(d1$`p-Value`)
hist(d1$`p-Value`, main = "pvalue")
summary(d1$FDR)
summary(d1$Tag_Count)
summary(d1$RPKM)

dr=subset(d1, d1$RPKM>=1)

summary(dr$Tag_Count)

d1[11641,]



setwd("~/Documents/HuR_PARclip_withHeaders/reference_gtf/")
library(GenomicFeatures)
library(GenomicRanges)
library(rtracklayer)
GTFfile = "Mus_musculus.NCBIM37.64-toMM9.gtf"
txdb <- makeTxDbFromGFF(GTFfile, format="gtf") 
utr5=fiveUTRsByTranscript(txdb, use.names=T)
utr3=threeUTRsByTranscript(txdb, use.names=T)

I0=IRanges(d1$Start,d1$End)
R0=Rle(d1$Chr)
S0=Rle(d1$Strand)
meta0=subset(d1, select=c("GeneName", "TranscriptID","customID"))
gr0=GRanges(R0,I0,S0, meta0)

t1=subsetByOverlaps(gr0, utr3)
n_occur <- data.frame(table(elementMetadata(t1)$TranscriptID))
n_occur[n_occur$Freq> 1,]

t2=subsetByOverlaps(gr0, utr5)
n_occur <- data.frame(table(elementMetadata(t2)$TranscriptID))
n_occur[n_occur$Freq> 1,]


t3=subsetByOverlaps(gr0,cds(txdb))
n_occur <- data.frame(table(elementMetadata(t3)$TranscriptID))
n_occur[n_occur$Freq> 1,]

dd1=as.data.frame(elementMetadata(t1)$customID)
colnames(dd1)="customID"
dd2=as.data.frame(elementMetadata(t2)$customID)
colnames(dd2)="customID"
dd3=as.data.frame(elementMetadata(t3)$customID)
colnames(dd3)="customID"

dd=d1
dd$utr3=d1$customID%in%dd1$customID
summary(dd$utr3)
dd$utr5=dd$customID%in%dd2$customID
summary(dd$utr5)
dd$cds=dd$customID%in%dd3$customID
summary(dd$cds)
setwd("~/giagkas/ASPEAK/IL4_v2/")
write.table(dd, file = "IL4-q20.peaks_annot_v2.txt", quote=F, row.names = F, sep = "\t" )

