cd /data/images/proton/DKlab/orsalia2/stringtie
/data/results/tools/rnaseq/subread/subread-1.5.2-source/bin/featureCounts -O -g gene_name -t exon -T 8 -a /data/images/proton/DKlab/orsalia2/stringtie/merged.gtf3 -o gene_counts_merged3_featurecounts.txt /data/images/proton/DKlab/orsalia/sortedwt01pos.sorted.bam  /data/images/proton/DKlab/orsalia/sortedwt02pos.sorted.bam  /data/images/proton/DKlab/orsalia/sortedwt21pos.bam  /data/images/proton/DKlab/orsalia/sortedwt22pos.bam  /data/images/proton/DKlab/orsalia/sortedwt61pos.bam  /data/images/proton/DKlab/orsalia/sortedwt62pos.bam  /data/images/proton/DKlab/orsalia/sortedwtIFN1pos.bam  /data/images/proton/DKlab/orsalia/sortedwtIFN2pos.bam  /data/images/proton/DKlab/orsalia/sortedwt01IL4pos.bam /data/images/proton/DKlab/orsalia/sortedwt02IL4pos.bam /data/images/proton/DKlab/orsalia/sortedKo01pos.sorted.bam  /data/images/proton/DKlab/orsalia/sortedKo02pos.sorted.bam   /data/images/proton/DKlab/orsalia/sortedKO21pos.bam  /data/images/proton/DKlab/orsalia/sortedKO22pos.bam  /data/images/proton/DKlab/orsalia/sortedKo61pos.bam  /data/images/proton/DKlab/orsalia/sortedKo62pos.bam  /data/images/proton/DKlab/orsalia/sortedKOIFN1pos.bam /data/images/proton/DKlab/orsalia/sortedKOIFN2pos.bam /data/images/proton/DKlab/orsalia/sortedKo1IL4pos.bam /data/images/proton/DKlab/orsalia/sortedKo2IL4pos.bam

require(edgeR)
#wrong FC sign: wt vs ko, correct is ko vs wt 
rnaseqMatrix1= read.table("gene_counts_merged3_featurecounts.txt",header=T,sep="\t",skip=1)
rnaseqMatrix0 =rnaseqMatrix1[,7:dim(rnaseqMatrix1)[2]];
rownames(rnaseqMatrix0)=rnaseqMatrix1$Geneid;
col_ordering = c(1,2,11,12)#0h
rnaseqMatrix =rnaseqMatrix0[,col_ordering];rnaseqMatrix = rnaseqMatrix[rowSums(rnaseqMatrix)>=2,];conditions = factor(c("WT","WT","KO","KO"));
exp_study = DGEList(counts=rnaseqMatrix, group=conditions);exp_study = calcNormFactors(exp_study);exp_study = estimateCommonDisp(exp_study);exp_study = estimateTagwiseDisp(exp_study);et = exactTest(exp_study)
tTags = topTags(et,n=NULL,adjust.method = "fdr")
r <- rownames(topTags(et)$table)
c=cpm(exp_study)[r, order(exp_study$samples$group)]
write.table(tTags, file="edgeRgenes_featurecount-0h.csv")

col_ordering = c(3,4,13,14)#2h
rnaseqMatrix =rnaseqMatrix0[,col_ordering];rnaseqMatrix = rnaseqMatrix[rowSums(rnaseqMatrix)>=2,];conditions = factor(c("WT","WT","KO","KO"));
exp_study = DGEList(counts=rnaseqMatrix, group=conditions);exp_study = calcNormFactors(exp_study);exp_study = estimateCommonDisp(exp_study);exp_study = estimateTagwiseDisp(exp_study);et = exactTest(exp_study)
tTags = topTags(et,n=NULL,adjust.method = "fdr")
write.table(tTags, file="edgeRgenes_featurecount-2h.csv")

col_ordering = c(5,6,15,16)#6h
rnaseqMatrix =rnaseqMatrix0[,col_ordering];rnaseqMatrix = rnaseqMatrix[rowSums(rnaseqMatrix)>=2,];conditions = factor(c("WT","WT","KO","KO"));
exp_study = DGEList(counts=rnaseqMatrix, group=conditions);exp_study = calcNormFactors(exp_study);exp_study = estimateCommonDisp(exp_study);exp_study = estimateTagwiseDisp(exp_study);et = exactTest(exp_study)
tTags = topTags(et,n=NULL,adjust.method = "fdr")
write.table(tTags, file="edgeRgenes_featurecount-6h.csv")


col_ordering = c(7,8,17,18)#IFN
rnaseqMatrix =rnaseqMatrix0[,col_ordering];rnaseqMatrix = rnaseqMatrix[rowSums(rnaseqMatrix)>=2,];conditions = factor(c("WT","WT","KO","KO"));
exp_study = DGEList(counts=rnaseqMatrix, group=conditions);exp_study = calcNormFactors(exp_study);exp_study = estimateCommonDisp(exp_study);exp_study = estimateTagwiseDisp(exp_study);et = exactTest(exp_study)
tTags = topTags(et,n=NULL,adjust.method = "fdr")
write.table(tTags, file="edgeRgenes_featurecount-IFN.csv")

col_ordering = c(9,10,19,20)#IL4
rnaseqMatrix =rnaseqMatrix0[,col_ordering];rnaseqMatrix = rnaseqMatrix[rowSums(rnaseqMatrix)>=2,];conditions = factor(c("WT","WT","KO","KO"));
exp_study = DGEList(counts=rnaseqMatrix, group=conditions);exp_study = calcNormFactors(exp_study);exp_study = estimateCommonDisp(exp_study);exp_study = estimateTagwiseDisp(exp_study);et = exactTest(exp_study)
tTags = topTags(et,n=NULL,adjust.method = "fdr")
write.table(tTags, file="edgeRgenes_featurecount-IL4.csv")



python -m HTSeq.scripts.count [options] <alignment_files> <gff_file>
htseq-count  --stranded=no   /data/images/proton/DKlab/orsalia/sortedwt01pos.sorted.bam  /data/images/proton/DKlab/orsalia/sortedwt02pos.sorted.bam  /data/images/proton/DKlab/orsalia/sortedwt21pos.bam  /data/images/proton/DKlab/orsalia/sortedwt22pos.bam  /data/images/proton/DKlab/orsalia/sortedwt61pos.bam  /data/images/proton/DKlab/orsalia/sortedwt62pos.bam  /data/images/proton/DKlab/orsalia/sortedwtIFN1pos.bam  /data/images/proton/DKlab/orsalia/sortedwtIFN2pos.bam  /data/images/proton/DKlab/orsalia/sortedwt01IL4pos.bam /data/images/proton/DKlab/orsalia/sortedwt02IL4pos.bam /data/images/proton/DKlab/orsalia/sortedKo01pos.sorted.bam  /data/images/proton/DKlab/orsalia/sortedKo02pos.sorted.bam   /data/images/proton/DKlab/orsalia/sortedKO21pos.bam  /data/images/proton/DKlab/orsalia/sortedKO22pos.bam  /data/images/proton/DKlab/orsalia/sortedKo61pos.bam  /data/images/proton/DKlab/orsalia/sortedKo62pos.bam  /data/images/proton/DKlab/orsalia/sortedKOIFN1pos.bam /data/images/proton/DKlab/orsalia/sortedKOIFN2pos.bam /data/images/proton/DKlab/orsalia/sortedKo1IL4pos.bam /data/images/proton/DKlab/orsalia/sortedKo2IL4pos.bam  /data/images/proton/DKlab/orsalia2/stringtie/merged.gtf3  > gene_counts_merged3.txt

#@
/data/results/tools/rnaseq/subread/subread-1.5.2-source/bin/featureCounts -O --fraction -M -s 1 -J -f -g gene_name -t exon -T 4 -a /data/results/reference/mmu/Mus_musculus.NCBIM37.64-toMM9.gtf -o  wt01.cnt  wt01.bam wt02.bam WT_0h_replicate_I.bam WT_0h_replicate_II_.bam
  -t <string>         Specify feature type in GTF annotation. 'exon' by 
                      default. Features used for read counting will be 
                      extracted from annotation using the provided value.

  -g <string>         Specify attribute type in GTF annotation. 'gene_id' by 
                      default. Meta-features used for read counting will be 
                      extracted from annotation using the provided value.
  -O                  Assign reads to all their overlapping meta-features (or 
                      features if -f is specified).
  -M                  Multi-mapping reads will also be counted. For a multi-
                      mapping read, all its reported alignments will be 
                      counted. The 'NH' tag in BAM/SAM input is used to detect 
                      multi-mapping reads.
  -J                  Count number of reads supporting each exon-exon junction.
                      Junctions were identified from those exon-spanning reads
                      in the input (containing 'N' in CIGAR string). Counting
                      results are saved to a file named '<output_file>.jcounts'

# Fractional counting

  --fraction          Assign fractional counts to features. This option must
                      be used together with '-M' or '-O' or both. When '-M' is
                      specified, each reported alignment from a multi-mapping
                      read (identified via 'NH' tag) will carry a fractional
                      count of 1/x, instead of 1 (one), where x is the total
                      number of alignments reported for the same read. When '-O'
                      is specified, each overlapping feature will receive a
                      fractional count of 1/y, where y is the total number of
                      features overlapping with the read. When both '-M' and
                      '-O' are specified, each alignment will carry a fractional
                      count of 1/(x*y).

  -s <int>            Perform strand-specific read counting. Acceptable values:
                      0 (unstranded), 1 (stranded) and 2 (reversely stranded).
                      0 by default.

		      
 -M --fraction not for edgeR
		      
		      

/data/results/tools/rnaseq/subread/subread-1.5.2-source/bin/featureCounts --help
/data/results/tools/rnaseq/subread/subread-1.5.2-source/bin/featureCounts: unrecognized option '--help'

Version 1.5.2

Usage: featureCounts [options] -a <annotation_file> -o <output_file> input_file1 [input_file2] ... 

## Mandatory arguments:

  -a <string>         Name of an annotation file. GTF/GFF format by default.
                      See -F option for more format information. Inbuilt
                      annotations (SAF format) is available in 'annotation'
                      directory of the package.

  -o <string>         Name of the output file including read counts. A separate
                      file including summary statistics of counting results is
                      also included in the output ('<string>.summary')

  input_file1 [input_file2] ...   A list of SAM or BAM format files. They can be
                      either name or location sorted. If not files provided,
                      <stdin> input is expected.

## Optional arguments:
# Annotation

  -F <string>         Specify format of the provided annotation file. Acceptable
                      formats include 'GTF' (or compatible GFF format) and
                      'SAF'. 'GTF' by default.  For SAF format, please refer to
                      Users Guide.

  -t <string>         Specify feature type in GTF annotation. 'exon' by 
                      default. Features used for read counting will be 
                      extracted from annotation using the provided value.

  -g <string>         Specify attribute type in GTF annotation. 'gene_id' by 
                      default. Meta-features used for read counting will be 
                      extracted from annotation using the provided value.

  -A <string>         Provide a chromosome name alias file to match chr names in
                      annotation with those in the reads. This should be a two-
                      column comma-delimited text file. Its first column should
                      include chr names in the annotation and its second column
                      should include chr names in the reads. Chr names are case
                      sensitive. No column header should be included in the
                      file.

# Level of summarization

  -f                  Perform read counting at feature level (eg. counting 
                      reads for exons rather than genes).

# Overlap between reads and features

  -O                  Assign reads to all their overlapping meta-features (or 
                      features if -f is specified).

  --minOverlap <int>  Minimum number of overlapping bases in a read that is
                      required for read assignment. 1 by default. Number of
                      overlapping bases is counted from both reads if paired
                      end. If a negative value is provided, then a gap of up
                      to specified size will be allowed between read and the
                      feature that the read is assigned to.

  --fracOverlap <float> Minimum fraction of overlapping bases in a read that is
                      required for read assignment. Value should be within range
                      [0,1]. 0 by default. Number of overlapping bases is
                      counted from both reads if paired end. Both this option
                      and '--minOverlap' option need to be satisfied for read
                      assignment.

  --largestOverlap    Assign reads to a meta-feature/feature that has the 
                      largest number of overlapping bases.

  --readExtension5 <int> Reads are extended upstream by <int> bases from their
                      5' end.

  --readExtension3 <int> Reads are extended upstream by <int> bases from their
                      3' end.

  --read2pos <5:3>    Reduce reads to their 5' most base or 3' most base. Read
                      counting is then performed based on the single base the 
                      read is reduced to.

# Multi-mapping reads

  -M                  Multi-mapping reads will also be counted. For a multi-
                      mapping read, all its reported alignments will be 
                      counted. The 'NH' tag in BAM/SAM input is used to detect 
                      multi-mapping reads.

# Fractional counting

  --fraction          Assign fractional counts to features. This option must
                      be used together with '-M' or '-O' or both. When '-M' is
                      specified, each reported alignment from a multi-mapping
                      read (identified via 'NH' tag) will carry a fractional
                      count of 1/x, instead of 1 (one), where x is the total
                      number of alignments reported for the same read. When '-O'
                      is specified, each overlapping feature will receive a
                      fractional count of 1/y, where y is the total number of
                      features overlapping with the read. When both '-M' and
                      '-O' are specified, each alignment will carry a fractional
                      count of 1/(x*y).

# Read filtering

  -Q <int>            The minimum mapping quality score a read must satisfy in
                      order to be counted. For paired-end reads, at least one
                      end should satisfy this criteria. 0 by default.

  --splitOnly         Count split alignments only (ie. alignments with CIGAR
                      string containing 'N'). An example of split alignments is
                      exon-spanning reads in RNA-seq data.

  --nonSplitOnly      If specified, only non-split alignments (CIGAR strings do
                      not contain letter 'N') will be counted. All the other
                      alignments will be ignored.

  --primary           Count primary alignments only. Primary alignments are 
                      identified using bit 0x100 in SAM/BAM FLAG field.

  --ignoreDup         Ignore duplicate reads in read counting. Duplicate reads 
                      are identified using bit Ox400 in BAM/SAM FLAG field. The 
                      whole read pair is ignored if one of the reads is a 
                      duplicate read for paired end data.

# Strandness

  -s <int>            Perform strand-specific read counting. Acceptable values:
                      0 (unstranded), 1 (stranded) and 2 (reversely stranded).
                      0 by default.

# Exon-exon junctions

  -J                  Count number of reads supporting each exon-exon junction.
                      Junctions were identified from those exon-spanning reads
                      in the input (containing 'N' in CIGAR string). Counting
                      results are saved to a file named '<output_file>.jcounts'

  -G <string>         Provide the name of a FASTA-format file that contains the
                      reference sequences used in read mapping that produced the
                      provided SAM/BAM files. This optional argument can be used
                      with '-J' option to improve read counting for junctions.

# Parameters specific to paired end reads

  -p                  If specified, fragments (or templates) will be counted
                      instead of reads. This option is only applicable for
                      paired-end reads.

  -B                  Only count read pairs that have both ends aligned.

  -P                  Check validity of paired-end distance when counting read 
                      pairs. Use -d and -D to set thresholds.

  -d <int>            Minimum fragment/template length, 50 by default.

  -D <int>            Maximum fragment/template length, 600 by default.

  -C                  Do not count read pairs that have their two ends mapping 
                      to different chromosomes or mapping to same chromosome 
                      but on different strands.

  --donotsort         Do not sort reads in BAM/SAM input. Note that reads from 
                      the same pair are required to be located next to each 
                      other in the input.

# Number of CPU threads

  -T <int>            Number of the threads. 1 by default.

# Miscellaneous

  -R                  Output detailed assignment result for each read. A text 
                      file will be generated for each input file, including 
                      names of reads and meta-features/features reads were 
                      assigned to. See Users Guide for more details.

  --tmpDir <string>   Directory under which intermediate files are saved (later
                      removed). By default, intermediate files will be saved to
                      the directory specified in '-o' argument.

  --maxMOp <int>      Maximum number of 'M' operations allowed in a CIGAR
                      string. 10 by default. Both 'X' and '=' are treated as 'M'
                      and adjacent 'M' operations are merged in the CIGAR
                      string.

  -v                  Output version of the program.



/data/results/tools/rnaseq/subread/subread-1.5.2-source/bin/featureCounts -O -g gene_name -t exon -T 8 -a /data/images/proton/DKlab/orsalia2/stringtie/merged.gtf3 -o gene_counts_merged3.txt /data/images/proton/DKlab/orsalia/sortedwt01pos.sorted.bam  /data/images/proton/DKlab/orsalia/sortedwt02pos.sorted.bam  /data/images/proton/DKlab/orsalia/sortedwt21pos.bam  /data/images/proton/DKlab/orsalia/sortedwt22pos.bam  /data/images/proton/DKlab/orsalia/sortedwt61pos.bam  /data/images/proton/DKlab/orsalia/sortedwt62pos.bam  /data/images/proton/DKlab/orsalia/sortedwtIFN1pos.bam  /data/images/proton/DKlab/orsalia/sortedwtIFN2pos.bam  /data/images/proton/DKlab/orsalia/sortedwt01IL4pos.bam /data/images/proton/DKlab/orsalia/sortedwt02IL4pos.bam /data/images/proton/DKlab/orsalia/sortedKo01pos.sorted.bam  /data/images/proton/DKlab/orsalia/sortedKo02pos.sorted.bam   /data/images/proton/DKlab/orsalia/sortedKO21pos.bam  /data/images/proton/DKlab/orsalia/sortedKO22pos.bam  /data/images/proton/DKlab/orsalia/sortedKo61pos.bam  /data/images/proton/DKlab/orsalia/sortedKo62pos.bam  /data/images/proton/DKlab/orsalia/sortedKOIFN1pos.bam /data/images/proton/DKlab/orsalia/sortedKOIFN2pos.bam /data/images/proton/DKlab/orsalia/sortedKo1IL4pos.bam /data/images/proton/DKlab/orsalia/sortedKo2IL4pos.bam

  -p                  If specified, fragments (or templates) will be counted
                      instead of reads. This option is only applicable for
                      paired-end reads.



library(edgeR)

data = read.table("/data/images/proton/run125/www/genes.counts.matrix", header=T, row.names=1, com='')
col_ordering = c(1,2,6,7)
rnaseqMatrix = data[,col_ordering]
rnaseqMatrix = round(rnaseqMatrix)
rnaseqMatrix = rnaseqMatrix[rowSums(rnaseqMatrix)>=2,]
conditions = factor(c(rep("FAGsA", 2), rep("FAGsB", 2)))

exp_study = DGEList(counts=rnaseqMatrix, group=conditions)
exp_study = calcNormFactors(exp_study)
exp_study = estimateCommonDisp(exp_study)
exp_study = estimateTagwiseDisp(exp_study)
et = exactTest(exp_study)
tTags = topTags(et,n=NULL,adjust.method = "fdr")
r = rownames(tTags)
c=data[r,col_ordering]

# Do a volcano plot
pdf("genes.counts.matrix.FAGsA_vs_FAGsB.edgeR.DE_results.MA_n_Volcano2.pdf", w=11, h=8.5)
plot(tTags$table$logFC, -log(tTags$table$PValue,10), main="Volcano plot of raw p-values", 
  pch=20, xlab="log2 ( Group B / Group A )", ylab="-log10 (raw p-value)")
#plot(tTags$table$logFC, -log(tTags$table$FDR,10), main="Volcano plot of FDR-adjusted p-values", 
#  pch=20, xlab="log2 ( Group B / Group A )", ylab="-log10 (FDR p-value)")
#plot(tTags$table$logFC, -log(tTags$table$PValue,10), main="Volcano plot showing genes with a raw p-value < 1e-05", 
#  pch=20, xlab="log2 ( Group B / Group A )", ylab="-log10 (raw p-value)", ylim=c(0,5))
#plot(tTags$table$logFC, -log(tTags$table$FDR,10), main="Volcano plot showing genes with a FDR p-value < 1e-05", 
#  pch=20, xlab="log2 ( Group B / Group A )", ylab="-log10 (FDR p-value)", ylim=c(0,5))
dev.off()

http://bioinf.wehi.edu.au/RNAseqCaseStudy/
library(Rsubread)
library(limma)
library(edgeR)
options(digits=2)
targets <- readTargets()
celltype <- factor(targets$CellType)
design <- model.matrix(~celltype)
targets
   CellType  InputFile OutputFile
1        A A_1.txt.gz    A_1.bam
2        A A_2.txt.gz    A_2.bam
3        B B_1.txt.gz    B_1.bam
4        B B_2.txt.gz    B_2.bam


x <- DGEList(counts=fc$counts, genes=fc$annotation[,c("GeneID","Length")])
#If you need RPKM values for genes, you can use the following command to generate them:

x_rpkm <- rpkm(x,x$genes$Length)
#Filtering. Only keep in the analysis those genes which had >10 reads per million mapped reads in at least two libraries.

isexpr <- rowSums(cpm(x) > 10) >= 2
x <- x[isexpr,]
#Normalization. Perform voom normalization:

y <- voom(x,design,plot=TRUE)

plotMDS(y,xlim=c(-2.5,2.5))
fit <- eBayes(lmFit(y,design))
topTable(fit,coef=2)

#
http://genomicsclass.github.io/book/pages/rnaseq_gene_level.html
library(pheatmap)
topgenes <- head(rownames(resSort),20)
mat <- assay(rld)[topgenes,]
mat <- mat - rowMeans(mat)
df <- as.data.frame(colData(dds)[,c("dex","cell")])
pheatmap(mat, annotation_col=df)

https://www.biostars.org/p/110861/
EM = EM # Imagine matrix of counts here...

conditions = c("con1", "con1", "con1", "con2", "con2", "con2", "con3", "con3", "con3", "con3")

batches = c("batch1", "batch2", "batch3", "batch1", "batch2", "batch3", "batch1", "con2", "con3", "con3")

dge = DGEList(counts=EM) # Create object

dge = calcNormFactors(dge, method='TMM') # Normalize library sizes using TMM

design = model.matrix(~0+conditions+batches) # Create design matrix for glm

colnames(design) = c(levels(conditions), levels(batches)[2:length(levels(batches))]) # Set prettier column names

dge = estimateGLMCommonDisp(dge, design) # Estimate common dispersion

dge = estimateGLMTagwiseDisp(dge, design) # Estimate tagwise dispersion

fit = glmFit(dge,design) # Fit glm

pair_vector = sprintf("%s-%s", "con1", "con3") # Samples to be compared
pair_contrast = makeContrasts(contrasts=pair_vector, levels=design) # Make contrast

lrt = glmLRT(fit, contrast=pair_contrast) # Likelihood ratio test

#multiple pairwise comp
https://support.bioconductor.org/p/9228/
To compute a design matrix to make all possible pairwise comparisons,
it is easier to do this
directly.  Here is a little function which does the job:

> design.pairs <-
function(levels) {
n <- length(levels)
design <- matrix(0,n,choose(n,2))
rownames(design) <- levels
colnames(design) <- 1:choose(n,2)
k <- 0
for (i in 1:(n-1))
for (j in (i+1):n) {
k <- k+1
design[i,k] <- 1
design[j,k] <- -1
colnames(design)[k] <- paste(levels[i],"-",levels[j],sep="")
}
design
}
> design.pairs(c("A","B","C","D"))
  A-B A-C A-D B-C B-D C-D
A   1   1   1   0   0   0
B  -1   0   0   1   1   0
C   0  -1   0  -1   0   1
D   0   0  -1   0  -1  -1

Gordon
		      
