# Load the IL4 q25 peak table, split its annotation field into separate
# columns, attach a per-peak custom ID, and report duplicated keys.
# Result is kept as `d1`.
library(dplyr)
library(tidyr)

setwd("~/giagkas/ASPEAK/IL4/")

# Column names for the peak table, in file order (the file is read without a
# header row).
peak_cols <- c(
  "Chr", "Strand", "Interval", "Interval_Length", "Tag_Count",
  "Maximum_Height", "Span_Size", "Start", "End", "RPKM",
  "Weighted_Center", "p-Value", "FDR"
)

r1 <- read.table("IL4-q25.peaks.txt")
# Fail early: assigning fewer names than columns would silently leave
# NA-named columns instead of raising an error.
stopifnot(ncol(r1) == length(peak_cols))
colnames(r1) <- peak_cols

# Split `Interval` on "|" into three pieces and append them to the original
# table (the unsplit `Interval` column is kept).
annot_cols <- c("TranscriptID", "GeneName", "Biotype")
annot <- separate(r1, Interval, into = annot_cols, sep = "\\|")
df <- cbind(r1, annot[annot_cols])

d <- within(df, customID <- paste(TranscriptID, Chr, Start, End, sep = "_"))

# check duplicated coordinates
# The key is built as a standalone vector: previously it was written to a
# `Coord` column that the next `within(df, ...)` call discarded, so this check
# always ran on NULL and could never report a duplicate. Keeping it out of `d`
# also leaves the columns of `d` (and anything exported from it) unchanged.
coord_key <- paste(d$Chr, d$Start, d$End, sep = "_")
n_occur <- data.frame(table(coord_key))
n_occur[n_occur$Freq > 1, ]
# check duplicated custom peak ID
n_occur <- data.frame(table(d$customID))
n_occur[n_occur$Freq > 1, ]

d1 <- d

##### 3UTR
# Load the peak table from utr3.reg.peaks.txt, split its annotation field
# into separate columns, attach a per-peak custom ID, and report duplicated
# keys. Result is kept as `d2`.
library(dplyr)
library(tidyr)

setwd("~/giagkas/ASPEAK/IL4/")

# Column names for the peak table, in file order (the file is read without a
# header row).
peak_cols <- c(
  "Chr", "Strand", "Interval", "Interval_Length", "Tag_Count",
  "Maximum_Height", "Span_Size", "Start", "End", "RPKM",
  "Weighted_Center", "p-Value", "FDR"
)

r1 <- read.table("utr3.reg.peaks.txt")
# Fail early: assigning fewer names than columns would silently leave
# NA-named columns instead of raising an error.
stopifnot(ncol(r1) == length(peak_cols))
colnames(r1) <- peak_cols

# Split `Interval` on "|" into three pieces and append them to the original
# table (the unsplit `Interval` column is kept).
annot_cols <- c("TranscriptID", "GeneName", "Biotype")
annot <- separate(r1, Interval, into = annot_cols, sep = "\\|")
df <- cbind(r1, annot[annot_cols])

d <- within(df, customID <- paste(TranscriptID, Chr, Start, End, sep = "_"))

# Duplicated coordinates. The key is built as a standalone vector: previously
# it was written to a `Coord` column that the next `within(df, ...)` call
# discarded, so this check always ran on NULL and could never report anything.
coord_key <- paste(d$Chr, d$Start, d$End, sep = "_")
n_occur <- data.frame(table(coord_key))
n_occur[n_occur$Freq > 1, ]

# Duplicated custom peak IDs.
n_occur <- data.frame(table(d$customID))
n_occur[n_occur$Freq > 1, ]

d2 <- d







##### intron
# Load the peak table from intron.reg.peaks.txt, split its annotation field
# into separate columns, attach a per-peak custom ID, and report duplicated
# keys. Result is kept as `d3`.
library(dplyr)
library(tidyr)

setwd("~/giagkas/ASPEAK/IL4/")

# Column names for the peak table, in file order (the file is read without a
# header row).
peak_cols <- c(
  "Chr", "Strand", "Interval", "Interval_Length", "Tag_Count",
  "Maximum_Height", "Span_Size", "Start", "End", "RPKM",
  "Weighted_Center", "p-Value", "FDR"
)

r1 <- read.table("intron.reg.peaks.txt")
# Fail early: assigning fewer names than columns would silently leave
# NA-named columns instead of raising an error.
stopifnot(ncol(r1) == length(peak_cols))
colnames(r1) <- peak_cols

# Split `Interval` on "|" into three pieces and append them to the original
# table (the unsplit `Interval` column is kept).
annot_cols <- c("TranscriptID", "GeneName", "Biotype")
annot <- separate(r1, Interval, into = annot_cols, sep = "\\|")
df <- cbind(r1, annot[annot_cols])

d <- within(df, customID <- paste(TranscriptID, Chr, Start, End, sep = "_"))

# Duplicated coordinates. The key is built as a standalone vector: previously
# it was written to a `Coord` column that the next `within(df, ...)` call
# discarded, so this check always ran on NULL and could never report anything.
coord_key <- paste(d$Chr, d$Start, d$End, sep = "_")
n_occur <- data.frame(table(coord_key))
n_occur[n_occur$Freq > 1, ]

# Duplicated custom peak IDs.
n_occur <- data.frame(table(d$customID))
n_occur[n_occur$Freq > 1, ]

d3 <- d







##### 5utr
# Load the peak table from utr5.reg.peaks.txt, split its annotation field
# into separate columns, attach a per-peak custom ID, and report duplicated
# keys. Result is kept as `d4`.
library(dplyr)
library(tidyr)

setwd("~/giagkas/ASPEAK/IL4/")

# Column names for the peak table, in file order (the file is read without a
# header row).
peak_cols <- c(
  "Chr", "Strand", "Interval", "Interval_Length", "Tag_Count",
  "Maximum_Height", "Span_Size", "Start", "End", "RPKM",
  "Weighted_Center", "p-Value", "FDR"
)

r1 <- read.table("utr5.reg.peaks.txt")
# Fail early: assigning fewer names than columns would silently leave
# NA-named columns instead of raising an error.
stopifnot(ncol(r1) == length(peak_cols))
colnames(r1) <- peak_cols

# Split `Interval` on "|" into three pieces and append them to the original
# table (the unsplit `Interval` column is kept).
annot_cols <- c("TranscriptID", "GeneName", "Biotype")
annot <- separate(r1, Interval, into = annot_cols, sep = "\\|")
df <- cbind(r1, annot[annot_cols])

d <- within(df, customID <- paste(TranscriptID, Chr, Start, End, sep = "_"))

# Duplicated coordinates. The key is built as a standalone vector: previously
# it was written to a `Coord` column that the next `within(df, ...)` call
# discarded, so this check always ran on NULL and could never report anything.
coord_key <- paste(d$Chr, d$Start, d$End, sep = "_")
n_occur <- data.frame(table(coord_key))
n_occur[n_occur$Freq > 1, ]

# Duplicated custom peak IDs.
n_occur <- data.frame(table(d$customID))
n_occur[n_occur$Freq > 1, ]

d4 <- d



#################

#combine


# Flag every peak in `d1` by whether its customID also occurs in the
# 3UTR (`d2`), intron (`d3`) and 5utr (`d4`) peak sets, then export `d1`.
# `%in%` never yields NA, so each flag is a plain TRUE/FALSE vector.
d1$X3UTR <- d1$customID %in% d2$customID
summary(d1$X3UTR)
d1$intron <- d1$customID %in% d3$customID
summary(d1$intron)
d1$X5UTR <- d1$customID %in% d4$customID
summary(d1$X5UTR)

# Peaks that appear in none of the three region sets. The flags are logical,
# so they are negated directly rather than compared to the string "FALSE".
test <- subset(d1, !(X3UTR | X5UTR | intron))

# Written relative to the current working directory.
write.table(
  d1,
  file = "IL4_ASpeak_q25_locationAnnot.txt",
  quote = FALSE,
  row.names = FALSE,
  sep = "\t"
)

summary(d1$Span_Size)

