rulebaseRIPSeek <- function(bamFiles, cNAME, featureGRanges, rpkmCutoff=0.4, fcCutoff=3, moreRIPGeneInfo=TRUE, idType="ensembl_transcript_id", myMin=.Machine$double.xmin, saveRData, ...) { ##################### Collect BAM files ##################### bamFilesRIP <- grep(cNAME, bamFiles, invert=TRUE, value=TRUE) bamFilesCTL <- grep(cNAME, bamFiles, invert=FALSE, value=TRUE) ##################### Compute RPKM ##################### if(!missing(featureGRanges)) { nRPKM <- computeRPKM(bamFiles=bamFilesRIP, justRPKM=TRUE, featureGRanges=featureGRanges, ...) dRPKM <- computeRPKM(bamFiles=bamFilesCTL,featureGRanges=featureGRanges, ...) } else { nRPKMList <- computeRPKM(bamFiles=bamFilesRIP, justRPKM=FALSE, ...) nRPKM <- nRPKMList$rpkmSEobj featureGRanges <- nRPKMList$featureGRanges dRPKM <- computeRPKM(bamFiles=bamFilesCTL, featureGRanges=featureGRanges,...) } ##################### Compute Foldchange ##################### RIP_rpkm <- metadata(nRPKM)[["rpkm"]] CTL_rpkm <- metadata(dRPKM)[["rpkm"]] foldchange <- (RIP_rpkm + myMin) / (CTL_rpkm + myMin) ##################### Rule-based Threshold ##################### # create a data.frame for easy viewing rpkmDF <- data.frame(RIP_count=unlist(assays(nRPKM)), RIP_rpkm=RIP_rpkm, CTL_count=unlist(assays(dRPKM)), CTL_rpkm=CTL_rpkm, foldchange=foldchange, row.names=names(rowRanges(nRPKM)), check.names=FALSE) rule <- (rpkmDF$RIP_rpkm >= rpkmCutoff) * (rpkmDF$foldchange >= fcCutoff) rpkmDF.passed <- rpkmDF[which(rule == 1), ] ##################### Annotate Features ##################### if(moreRIPGeneInfo) { featureID <- rownames(rpkmDF.passed) # hack useMart to ignore unused arguments formals(useMart) <- c(formals(useMart), alist(... = )) mart <- useMart(...) geneInfo <- getBM(mart=mart, attributes=c("chromosome_name", "start_position", "end_position", "strand", "external_gene_name", "ensembl_transcript_id", "ensembl_gene_id", "ucsc", "description"), filters=idType, values = featureID) geneInfo <- geneInfo[match(featureID, geneInfo[,idType]),] # form data.frame in bed format rpkmDF.annotated <- cbind( geneInfo[,c(1,2,3)], rpkmDF.passed[, c(2,5)], geneInfo[,4, drop=FALSE], rpkmDF.passed[, -c(2,5)], geneInfo[, -c(1:4)] ) rownames(rpkmDF.annotated) <- rownames(rpkmDF.passed) if(!("chr" %in% rpkmDF.annotated$chromosome_name)) { rpkmDF.annotated$chromosome_name <- paste("chr", rpkmDF.annotated$chromosome_name, sep="") } if(is.integer(rpkmDF.annotated$strand)) { rpkmDF.annotated$strand <- sub("^1$", "+", rpkmDF.annotated$strand) rpkmDF.annotated$strand <- sub("-1$", "-", rpkmDF.annotated$strand) } rpkmDF <- rpkmDF.annotated } ##################### Output ##################### ruleBasedRIPSeekResults <- list(nRPKM=nRPKM, dRPKM=dRPKM, rpkmDF=rpkmDF, rpkmCutoff=rpkmCutoff, fcCutoff=fcCutoff, featureGRanges=featureGRanges) if(!missing(saveRData)) { save(ruleBasedRIPSeekResults, file=saveRData) write.table(rpkmDF.annotated, file=sub("RData$", "txt", saveRData), sep="\t", quote=F, row.names=F) } return(ruleBasedRIPSeekResults) } computeRPKM <- function(bamFiles, RIPSeekerRead=TRUE, paired=FALSE, countMode="IntersectionNotEmpty", featureGRanges, idType="ensembl_transcript_id", featureType="exon", ignore.strand=FALSE, txDbName="biomart", moreGeneInfo=FALSE, saveData, justRPKM=TRUE, ...) { library(Rsamtools) library(GenomicFeatures) library(biomaRt) ##################### Make transcript db ##################### if(missing(featureGRanges)) { stopifnot(txDbName %in% c("biomart", "UCSC")) stopifnot(featureType %in% c("exon", "intron", "fiveUTR", "threeUTR", "CDS")) stopifnot(list(...)$by %in% c("tx", "gene")) # hack functions to ignore unused arguments formals(makeTxDbFromBiomart) <- c(formals(makeTxDbFromBiomart), alist(... = )) formals(makeTxDbFromUCSC) <- c(formals(makeTxDbFromUCSC), alist(... = )) if(txDbName == "biomart") txDb <- makeTxDbFromBiomart(...) if(txDbName == "UCSC") txDb <- makeTxDbFromUCSC(...) # group annotation by transcripts ("tx") or gene if(list(...)$by == "tx") { if(featureType == "exon") featureGRanges <- exonsBy(txDb, by="tx", use.names=TRUE) if(featureType == "CDS") featureGRanges <- cdsBy(txDb, by="tx", use.names=TRUE) } else { if(featureType == "exon") featureGRanges <- exonsBy(txDb, by="gene") if(featureType == "CDS") featureGRanges <- cdsBy(txDb, by="gene") if(featureType == "intron") featureGRanges <- intronsByTranscript(txDb, use.names=TRUE) if(featureType == "fiveUTR") featureGRanges <- fiveUTRsByTranscript(txDb,use.names=TRUE) if(featureType == "threeUTR") featureGRanges <- threeUTRsByTranscript(txDb,use.names=TRUE) } # ensembl uses numericals for chromosome ID (e.g. 1 rather than chr1) # add chr to it to be consistent with the alignment if(txDbName == "biomart") seqlevels(featureGRanges) <- paste("chr", seqlevels(featureGRanges), sep="") } else { stopifnot(class(featureGRanges) %in% c("character", "GRanges", "GRangesList")) } # assuming featureGRanges is the file name of tab-delim data if(!missing(featureGRanges) && is.character(featureGRanges)) { featureGRanges <- as(read.delim(featureGRanges), "GRanges") } ##################### Read BAM ##################### if(RIPSeekerRead) { # read alignment files using RIPSeeker built-in function aligns <- combineAlignGals(bamFiles, ...) } else { # read by directly calling required functions aligns <- NULL message(sprintf("%d BAM files are combined", length(bamFiles))) for(bamFile in bamFiles) { if(paired) { # paired-end param <- ScanBamParam(flag=scanBamFlag(isUnmappedQuery=FALSE, isPaired=paired, hasUnmappedMate=FALSE, isProperPair=paired, isDuplicate=FALSE)) if(is.null(aligns)) aligns <- galp2gal(readGAlignmentPairs(bamFile, param=param)) if(!is.null(aligns)) aligns <- append(aligns, galp2gal(readGAlignmentPairs(bamFile, param=param))) } else { # single-end param <- ScanBamParam(flag=scanBamFlag(isUnmappedQuery=FALSE, isDuplicate=FALSE)) if(is.null(aligns)) aligns <- readGAlignments(bamFile, param=param) if(!is.null(aligns)) aligns <- append(aligns, readGAlignments(bamFile, param=param)) } } } ##################### Compute RPKM ##################### # count reads to return RangedSummarizedExperiment object rpkmSEobj <- summarizeOverlaps(features=featureGRanges, reads=aligns, mode=countMode, ignore.strand=ignore.strand) # get counts counts <- unlist(assays(rpkmSEobj)) # compute the 'K' in RPKM numBases <- sum(width(featureGRanges)) geneLengthsInKB <- numBases / 1e3 # comptue the M in RPKM millionsMapped <- length(aligns) / 1e6 # per million of total mapped reads # counted reads / total reads in millions rpm <- counts / millionsMapped # reads per million per geneLength in Kb rpkm <- rpm / geneLengthsInKB # save rpkm in metadata slot of the RangedSummarizedExperiment object metadata(rpkmSEobj) <- list(rpkm=rpkm) ##################### Save as data.frame ##################### # create a data.frame for easy viewing rpkmDF <- data.frame(count=counts, rpkm=rpkm, totalExonLength=numBases, row.names=names(rowRanges(rpkmSEobj)), check.names=FALSE) if(!paired) names(rpkmDF) <- c("counts", "rpkm", "totalExonLength") if(paired) names(rpkmDF) <- c("counts", "fpkm", "totalExonLength") ##################### Annotate Features ##################### if(moreGeneInfo) { featureID <- rownames(rpkmDF) # hack useMart to ignore unused arguments formals(useMart) <- c(formals(useMart), alist(... = )) mart <- useMart(...) geneInfo <- getBM(mart=mart, attributes=c("chromosome_name", "start_position", "end_position", "strand", "external_gene_name", "ensembl_transcript_id", "ensembl_gene_id", "ucsc", "description"), filters=idType, values = featureID) geneInfo <- geneInfo[match(featureID, geneInfo[,idType]),] # form data.frame in bed format rpkmDF.annotated <- cbind(geneInfo[,c(1,2,3,5)], totalExonLength=rpkmDF$totalExonLength, geneInfo[,4, drop=FALSE], uniqueMapCount=rpkmDF$counts, ensembl_gene_id=geneInfo$ensembl_gene_id, geneLength=abs(geneInfo$end - geneInfo$start), description=geneInfo$description) if(!paired) rpkmDF.annotated$RPKM <- rpkmDF$rpkm if(paired) rpkmDF.annotated$FPKM <- rpkmDF$fpkm rownames(rpkmDF.annotated) <- rownames(rpkmDF) if(!("chr" %in% rpkmDF.annotated$chromosome_name)) { rpkmDF.annotated$chromosome_name <- paste("chr", rpkmDF.annotated$chromosome_name, sep="") } if(is.integer(rpkmDF.annotated$strand)) { rpkmDF.annotated$strand <- sub("^1$", "+", rpkmDF.annotated$strand) rpkmDF.annotated$strand <- sub("-1$", "-", rpkmDF.annotated$strand) } rpkmDF <- rpkmDF.annotated } ##################### Outputs ##################### if(!missing(saveData)) save(rpkmDF, file=saveData) if(justRPKM) { ruleBasedRIPSeekResults <- rpkmSEobj } else { ruleBasedRIPSeekResults <- list(rpkmSEobj=rpkmSEobj, rpkmDF=rpkmDF, featureGRanges=featureGRanges) } return(ruleBasedRIPSeekResults) } txDbName <- "biomart" biomart <- "ENSEMBL_MART_ENSEMBL" # use archive to get ensembl 65 # dataset <- "mmusculus_gene_ensembl" # host <- "dec2011.archive.ensembl.org" # use ensembl 65 for annotation host="grch37.ensembl.org" dataset="hsapiens_gene_ensembl" resultlist <- rulebaseRIPSeek(bamFiles, cNAME, dataset=dataset, txDbName=txDbName, biomart=biomart, host=host, by="tx",moreRIPGeneInfo=FALSE,moreGeneInfo=FALSE) bamFiles [1] "PHR23r-FUBP2.bam" "PHR24r-IgG.bam" > outDir [1] "/mnt/max/c/genomics_facility/proton/run341/www/RIPSeeker_FUBP2_vs_IgG_bothStrands" > write.table(df, file="/mnt/max/c/genomics_facility/proton/run341/www/RIPSeeker_FUBP2_vs_IgG_RPKM.csv", append=FALSE, quote=TRUE, sep = ";", eol = "\n", na = "NA", dec = ".", row.names=TRUE, col.names=TRUE, qmethod = c("escape", "double"), fileEncoding = "") MALAT: "ENST00000534336";17651;288.05227194245;116;1.53472476463235;187.689857217789 (/ 288.05 187.69) 1.5347114923544143