#IL4
#read differ expression table
setwd("~/giagkas/RNAseq_POLYseq/")
de=read.table("RNA-Poly_IL4total.txt", header = T)

#read aspeak
setwd("~/giagkas/ASPEAK/IL4_v2/")
dc=read.table("IL4-q20.peaks_annot_v3.txt", header = T)

#merge
m=merge(de, dc, by="GeneName", all = T)

#filter protein coding, wtRPKM RNA >1
mf1=subset(m, (m$wtRPKM.RNA>1)&(m$RPKM>1)&(m$Biotype.x=="protein_coding"))


n_occur <- data.frame(table(mf1$GeneName))
n_occur[n_occur$Freq> 1,]
n_occur=subset(n_occur, n_occur$Freq>0)
summary(n_occur$Freq)

# create custome coordID and keep only one per gene between those genomic coordinates that match (prefenrence tag count>RPKM)
# !!this is for exact match, not overlaping sites !! 
df1=within(mf1,   coordID<- paste(Chr,Start,End,  sep="_"))
df2=df1[ order(df1$coordID, df1$Tag_Count, df1$RPKM, decreasing = TRUE), ]
dd=subset(df2, !duplicated(df2$coordID))


#######
#IL4 piechart for genomic distribution of binding sites(clusters).
#####
d=dd
n=nrow(d)
d1=subset(d, d$utr3=="TRUE")
n1=nrow(d1)
d2=subset(d, (d$utr3=="FALSE")&(d$cds=="TRUE"))
n2=nrow(d2)
d3=subset(d, (d$utr3=="FALSE")&(d$cds=="FALSE")&(d$utr5=="TRUE"))
n3=nrow(d3)
d4=subset(d, (d$utr3=="FALSE")&(d$cds=="FALSE")&(d$utr5=="FALSE")&(d$intron=="TRUE"))
n4=nrow(d4)


df <- data.frame(
  group = c("3'UTR", "CDS", "5'UTR", "Intron"),
  value = c(n1, n2, n3, n4)
)

setwd("~/giagkas/ASPEAK/IL4_v2/genomicDistribution/")
png(filename = "GenomicDistr_peaks_IL4_proteincoding.png", width = 1000, height = 1000)
par(lwd=3,cex=3.0,font=2,cex.axis=1.2, font.lab=2, font.axis=2, cex.lab=2, cex.main=1.8, font.main=1)
pct =round(df$value/sum(df$value)*100, digits = 1)
lbls <- paste(df$group, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
pie(df$value,labels = lbls, col=gray.colors(length(lbls)),main="IL4 Binding Sites")
dev.off()


#filter for differentially expressed genes (abundance), pval<0.05, -1.5>FC>1.5
d=subset(dd, (dd$PValue.RNA<0.05)&(abs(dd$logFC.RNA)>0.5849625)&(dd$wtRPKM.RNA>1))
n=nrow(d)
d1=subset(d, d$utr3=="TRUE")
n1=nrow(d1)
d2=subset(d, (d$utr3=="FALSE")&(d$cds=="TRUE"))
n2=nrow(d2)
d3=subset(d, (d$utr3=="FALSE")&(d$cds=="FALSE")&(d$utr5=="TRUE"))
n3=nrow(d3)
d4=subset(d, (d$utr3=="FALSE")&(d$cds=="FALSE")&(d$utr5=="FALSE")&(d$intron=="TRUE"))
n4=nrow(d4)

df <- data.frame(
  group = c("3'UTR", "CDS", "5'UTR", "Intron"),
  value = c(n1, n2, n3, n4)
)

setwd("~/giagkas/ASPEAK/IL4_v2/genomicDistribution/")
png(filename = "GenomicDistr_peaks_IL4_prcod_Abund_Signfic.png", width = 1000, height = 1000)
par(lwd=3,cex=3.0,font=2,cex.axis=1.2, font.lab=2, font.axis=2, cex.lab=2, cex.main=1.8, font.main=1)
pct =round(df$value/sum(df$value)*100, digits = 1)
lbls <- paste(df$group, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
pie(df$value,labels = lbls, col=gray.colors(length(lbls)),main="IL4 Binding Sites Abundance")
dev.off()


#filter for differentially expressed genes (polysomes), pval<0.05, -1.5>FC>1.5
d=subset(dd, (dd$PValue.POLY<0.05)&(abs(dd$logFC.POLY)>0.5849625)&(dd$wtRPKM.RNA>1)&(dd$wtRPKM.POLY>1))
n=nrow(d)
d1=subset(d, d$utr3=="TRUE")
n1=nrow(d1)
d2=subset(d, (d$utr3=="FALSE")&(d$cds=="TRUE"))
n2=nrow(d2)
d3=subset(d, (d$utr3=="FALSE")&(d$cds=="FALSE")&(d$utr5=="TRUE"))
n3=nrow(d3)
d4=subset(d, (d$utr3=="FALSE")&(d$cds=="FALSE")&(d$utr5=="FALSE")&(d$intron=="TRUE"))
n4=nrow(d4)

df <- data.frame(
  group = c("3'UTR", "CDS", "5'UTR", "Intron"),
  value = c(n1, n2, n3, n4)
)

setwd("~/giagkas/ASPEAK/IL4_v2/genomicDistribution/")
png(filename = "GenomicDistr_peaks_IL4_prcod_Poly_Signfic.png", width = 1000, height = 1000)
par(lwd=3,cex=3.0,font=2,cex.axis=1.2, font.lab=2, font.axis=2, cex.lab=2, cex.main=1.8, font.main=1)
pct =round(df$value/sum(df$value)*100, digits = 1)
lbls <- paste(df$group, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
pie(df$value,labels = lbls, col=gray.colors(length(lbls)),main="IL4 Binding Sites Translation")
dev.off()


##### gene wise
#sort (order ) by 3UTR, CDS,5UTR, intron and remove duplicates keeping the first peak per gene.
dd2=dd[ order(dd$GeneName, dd$utr3, dd$cds, dd$utr5, dd$intron , decreasing = TRUE), ]
dd2=subset(dd2, !duplicated(dd2$GeneName))

d=dd2
n=nrow(d)
d1=subset(d, d$utr3=="TRUE")
n1=nrow(d1)
d2=subset(d, (d$utr3=="FALSE")&(d$cds=="TRUE"))
n2=nrow(d2)
d3=subset(d, (d$utr3=="FALSE")&(d$cds=="FALSE")&(d$utr5=="TRUE"))
n3=nrow(d3)
d4=subset(d, (d$utr3=="FALSE")&(d$cds=="FALSE")&(d$utr5=="FALSE")&(d$intron=="TRUE"))
n4=nrow(d4)

df <- data.frame(
  group = c("3'UTR", "CDS", "5'UTR", "Intron"),
  value = c(n1, n2, n3, n4)
)

setwd("~/giagkas/ASPEAK/IL4_v2/genomicDistribution/")
png(filename = "GenomicDistr_Gene-wise_IL4_proteincoding.png", width = 1000, height = 1000)
par(lwd=3,cex=3.0,font=2,cex.axis=1.2, font.lab=2, font.axis=2, cex.lab=2, cex.main=1.8, font.main=1)
pct =round(df$value/sum(df$value)*100, digits = 1)
lbls <- paste(df$group, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
pie(df$value,labels = lbls, col=gray.colors(length(lbls)),main="IL4 Genes")
dev.off()


#filter for differentially expressed genes (abundance), pval<0.05, -1.5>FC>1.5
d=subset(dd2, (dd2$PValue.RNA<0.05)&(abs(dd2$logFC.RNA)>0.5849625)&(dd2$wtRPKM.RNA>1))
n=nrow(d)
d1=subset(d, d$utr3=="TRUE")
n1=nrow(d1)
d2=subset(d, (d$utr3=="FALSE")&(d$cds=="TRUE"))
n2=nrow(d2)
d3=subset(d, (d$utr3=="FALSE")&(d$cds=="FALSE")&(d$utr5=="TRUE"))
n3=nrow(d3)
d4=subset(d, (d$utr3=="FALSE")&(d$cds=="FALSE")&(d$utr5=="FALSE")&(d$intron=="TRUE"))
n4=nrow(d4)

df <- data.frame(
  group = c("3'UTR", "CDS", "5'UTR", "Intron"),
  value = c(n1, n2, n3, n4)
)
setwd("~/giagkas/ASPEAK/IL4_v2/genomicDistribution/")
png(filename = "GenomicDistr_Genewise_IL4_prcod_Abund_Signif.png", width = 1000, height = 1000)
par(lwd=3,cex=3.0,font=2,cex.axis=1.2, font.lab=2, font.axis=2, cex.lab=2, cex.main=1.8, font.main=1)
pct =round(df$value/sum(df$value)*100, digits = 1)
lbls <- paste(df$group, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
pie(df$value,labels = lbls, col=gray.colors(length(lbls)),main="IL4 Genes Abundance")
dev.off()


#filter for differentially expressed genes (polysomes), pval<0.05, -1.5>FC>1.5
d=subset(dd2, (dd2$PValue.POLY<0.05)&(abs(dd2$logFC.POLY)>0.5849625)&(dd2$wtRPKM.RNA>1)&(dd2$wtRPKM.POLY>1))
n=nrow(d)
d1=subset(d, d$utr3=="TRUE")
n1=nrow(d1)
d2=subset(d, (d$utr3=="FALSE")&(d$cds=="TRUE"))
n2=nrow(d2)
d3=subset(d, (d$utr3=="FALSE")&(d$cds=="FALSE")&(d$utr5=="TRUE"))
n3=nrow(d3)
d4=subset(d, (d$utr3=="FALSE")&(d$cds=="FALSE")&(d$utr5=="FALSE")&(d$intron=="TRUE"))
n4=nrow(d4)

df <- data.frame(
  group = c("3'UTR", "CDS", "5'UTR", "Intron"),
  value = c(n1, n2, n3, n4)
)

setwd("~/giagkas/ASPEAK/IL4_v2/genomicDistribution/")
png(filename = "GenomicDistr_Gene-wise_IL4_prcod_Translation.png", width = 1000, height = 1000)
par(lwd=3,cex=3.0,font=2,cex.axis=1.2, font.lab=2, font.axis=2, cex.lab=2, cex.main=1.8, font.main=1)
pct =round(df$value/sum(df$value)*100, digits = 1)
lbls <- paste(df$group, pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
pie(df$value,labels = lbls, col=gray.colors(length(lbls)),main="IL4 Genes Translation")
dev.off()