#raw reads in:
/data/images/proton/DKlab/mr/Polysomes/reads/Backup_Polysomes_TTP/

#adapter removal, also maps machine file id to sample
#all samples:
/data/images/proton/DKlab/mr/Polysomes/reads/cutadapt-calls.sh
#one sample: 
This is cutadapt 1.9.1 with Python 2.7.9
Command line parameters: -q 20 -z -m 15 -c -a CGCCTTGGCCGTACAGCAG ../Backup_Polysomes_TTP/ugc_623_BC_FRAG_W_Paulina_150506_L03_ugc_623_20_F3.csfasta /data/images/proton/DKlab/polysomes/reads/Backup_Polysomes_TTP/ugc_623_BC_FRAG_W_Paulina_150506_L03_ugc_623_20_F3.QV.qual
# fastq in /data/images/proton/DKlab/mr/Polysomes/reads/fastq/
    -z, --zero-cap      Change negative quality values to zero. This is enabled by default when -c/--colorspace is also
                        enabled. Use the above option to disable it.
    -m LENGTH, --minimum-length=LENGTH
                        Discard trimmed reads that are shorter than LENGTH. Reads that are too short even before adapter
                        removal are also discarded. In colorspace, an initial primer is not counted (default: 0).
    -q [5'CUTOFF,]3'CUTOFF, --quality-cutoff=[5'CUTOFF,]3'CUTOFF
                        Trim low-quality bases from 5' and/or 3' ends of reads before adapter removal. If one value is
                        given, only the 3' end is trimmed. If two comma-separated cutoffs are given, the 5' end is trimmed
                        with the first cutoff, the 3' end with the second. See documentation for the algorithm. (default:
                        no trimming)
    -a ADAPTER, --adapter=ADAPTER
                        Sequence of an adapter that was ligated to the 3' end. The adapter itself and anything that
                        follows is trimmed. If the adapter sequence ends with the '$' character, the adapter is anchored
                        to the end of the read and only found if it is a suffix of the read.
    -c, --colorspace    Enable colorspace mode: Also trim the color that is adjacent to the found adapter.

# adapter and  qual-trimming stats in
/data/images/proton/DKlab/mr/Polysomes/README-nreads.txt


#alignment
#one:
/data/results/tools/align/tophat-1.4.0.Linux_x86_64/tophat --transcriptome-only -p 10 -n 2 -o KO_0h_replicate_Ib -G /data/results/reference/mmu/Mus_musculus.NCBIM37.64-toMM9.gtf -C  /data/results/reference/mmu/mm9/bowtie1/mm9c ../fastq/KO_0h_replicate_I.fastq
#more:
/data/images/proton/DKlab/mr/Polysomes/reads/tophat/tophat-calls.sh1

    -T/--transcriptome-only                    (map only to the transcriptome)
    -n/--transcriptome-mismatches  <int>       [ default: 1                ]
    -G/--GTF                       <filename>  (GTF/GFF with known transcripts)
    -C/--color                                 (Solid - color space)
#default settings, not modified:
-g/--max-multihits             <int>       [ default: 20               ]
-x/--transcriptome-max-hits    <int>       [ default: 60               ]


# quantitation
# A) per-transcript
/data/images/proton/DKlab/mr/Polysomes/reads/stringtie/stringtie-calls-transcriptome.sh
#one sample:
/data/results/tools/rnaseq/stringtie/stringtie-1.3.3b.Linux_x86_64/stringtie ../tophat/KO_0h_replicate_Ib/accepted_hits.bam -p 10 -o  ballgown/KO_0h_replicate_I/KO_0h_replicate_I.gtf -A ballgown/KO_0h_replicate_I/KO_0h_replicate_I.tab -B -G /data/results/reference/mmu/Mus_musculus.NCBIM37.64-toMM9.gtf &> KO_0h_replicate_I.log
 -B enable output of Ballgown table files which will be created in the
    same directory as the output GTF (requires -G, -o recommended)

# merge transcripts per sample into common gtf
/data/results/tools/rnaseq/stringtie/stringtie-1.3.3b.Linux_x86_64/stringtie --merge -o merged.gtf -G /data/results/reference/mmu/Mus_musculus.NCBIM37.64-toMM9.gtf ballgown/KO_0h_replicate_II/KO_0h_replicate_II.gtf ballgown/KO_0h_replicate_I/KO_0h_replicate_I.gtf ballgown/KO_2h_replicate_II/KO_2h_replicate_II.gtf ballgown/KO_2h_replicate_I/KO_2h_replicate_I.gtf ballgown/KO_6h_replicate_II/KO_6h_replicate_II.gtf ballgown/KO_6h_replicate_I/KO_6h_replicate_I.gtf ballgown/KO_IFN-gamma_replicate_II/KO_IFN-gamma_replicate_II.gtf ballgown/KO_IFN_replicate_I/KO_IFN_replicate_I.gtf ballgown/KO_IL4_replicate_II/KO_IL4_replicate_II.gtf ballgown/KO_IL4_replicate_I/KO_IL4_replicate_I.gtf ballgown/WT_0h_replicate_II/WT_0h_replicate_II.gtf ballgown/WT_0h_replicate_I/WT_0h_replicate_I.gtf ballgown/WT_2h_replicate_II/WT_2h_replicate_II.gtf ballgown/WT_2h_replicate_I/WT_2h_replicate_I.gtf ballgown/WT_6h_replicate_II/WT_6h_replicate_II.gtf ballgown/WT_6h_replicate_I/WT_6h_replicate_I.gtf ballgown/WT_IFN-gamma_replicate_II/WT_IFN-gamma_replicate_II.gtf ballgown/WT_IFN-gamma_replicate_I/WT_IFN-gamma_replicate_I.gtf ballgown/WT_IL4_replicate_II/WT_IL4_replicate_II.gtf ballgown/WT_IL4_replicate_I/WT_IL4_replicate_I.gtf

# quantify again with merged.gtf
/data/results/tools/rnaseq/stringtie/stringtie-1.3.3b.Linux_x86_64/stringtie ../tophat/KO_0h_replicate_Ib/accepted_hits.bam -p 10 -o  ballgown/KO_0h_replicate_I/KO_0h_replicate_I.gtf -e -B  -G merged.gtf &> KO_0h_replicate_I.log
 -e only estimate the abundance of given reference transcripts (requires -G)
# results are collected in folder ballgown (needed for prepDE.py, see below)

# generate count matrix for edgeR
/data/results/tools/rnaseq/stringtie/stringtie-master/prepDE.py
result in transcript_count_matrix.csv

# diff. exp.
Rscript  edgeR2.r
rnaseqMatrix0= read.table("transcript_count_matrix.csv",header=T,sep=",")
..
#1st pairing
col_ordering = c(11,12,1,2)#0h
rnaseqMatrix =rnaseqMatrix0[,col_ordering];
# keep transcripts that have at least 5 total reads in one condition
rnaseqMatrix = rnaseqMatrix[(rowSums(rnaseqMatrix[,1:2])>=5)|(rowSums(rnaseqMatrix[,3:4])>=5),];
..
write.table(tTags, file="edgeRtranscripts2-0h.csv")
#with expression per sample:
write.table(x, file="edgeRtranscripts2Expr-0h.csv")


ls edgeRtranscripts2-?h.csv > edgeR-results2.txt
ls edgeRtranscripts2-???.csv >> edgeR-results2.txt
#awk -v f="/data/images/proton/DKlab/mr/Polysomes/reads/stringtie/merged.gtf" -f addGeneToEdger.awk
# add  columns "geneName geneID transcriptID " before results
cat edgeR-results2.txt | awk -v f="/data/images/proton/DKlab/mr/Polysomes/reads/stringtie/merged.gtf" -f addGeneToEdger.awk

# quantitation
# B) per-gene

/data/images/proton/DKlab/mr/Polysomes/reads/stringtie/per-gene/README-polysomeSeq-per-gene.txt
/data/results/tools/rnaseq/subread/subread-1.5.2-source/bin/featureCounts -O -g gene_name -t exon -T 8 -a ~/bak/doc/fleming/kafasla/DKlab/mr/Polysomes/reads/stringtie/merged.gtf -o gene_counts_merged3_featurecounts.txt ../../tophat/KO_0h_replicate_I/accepted_hits.bam ../../tophat/KO_0h_replicate_II/accepted_hits.bam ../../tophat/KO_2h_replicate_I/accepted_hits.bam ../../tophat/KO_2h_replicate_II/accepted_hits.bam ../../tophat/KO_6h_replicate_I/accepted_hits.bam ../../tophat/KO_6h_replicate_II/accepted_hits.bam ../../tophat/KO_IFN-gamma_replicate_II/accepted_hits.bam ../../tophat/KO_IFN_replicate_I/accepted_hits.bam ../../tophat/KO_IL4_replicate_I/accepted_hits.bam ../../tophat/KO_IL4_replicate_II/accepted_hits.bam ../../tophat/WT_0h_replicate_I/accepted_hits.bam ../../tophat/WT_0h_replicate_II/accepted_hits.bam ../../tophat/WT_2h_replicate_I/accepted_hits.bam ../../tophat/WT_2h_replicate_II/accepted_hits.bam ../../tophat/WT_6h_replicate_I/accepted_hits.bam ../../tophat/WT_6h_replicate_II/accepted_hits.bam ../../tophat/WT_IFN-gamma_replicate_I/accepted_hits.bam ../../tophat/WT_IFN-gamma_replicate_II/accepted_hits.bam ../../tophat/WT_IL4_replicate_I/accepted_hits.bam ../../tophat/WT_IL4_replicate_II/accepted_hits.bam

  -O                  Assign reads to all their overlapping meta-features (or 
                      features if -f is specified).
  -g <string>         Specify attribute type in GTF annotation. 'gene_id' by 
                      default. Meta-features used for read counting will be 
                      extracted from annotation using the provided value.
  -t <string>         Specify feature type in GTF annotation. 'exon' by 
                      default. Features used for read counting will be 
                      extracted from annotation using the provided value.
  -T <int>            Number of the threads. 1 by default.
  -a <string>         Name of an annotation file. GTF/GFF format by default.
                      See -F option for more format information. Inbuilt
                      annotations (SAF format) is available in 'annotation'
                      directory of the package.

# Rscript for edgeR in /data/images/proton/DKlab/mr/Polysomes/reads/stringtie/per-gene/README-polysomeSeq-per-gene.txt
rnaseqMatrix1= read.table("gene_counts_merged3_featurecounts.txt",header=T,sep="\t",skip=1)
..
write.table(tTags, file="edgeRgenes_featurecount-0h.csv")
write.table(x, file="edgeRgenes_featurecountExpr-0h.csv")