/data/images/proton/DKlab/mr/rnaseq/cutadapt.sh /data/images/proton/DKlab/mr/rnaseq/hisat.sh #@ /data/results/tools/rnaseq/hisat/hisat2-2.1.0/hisat2-build -p 9 /data/results/reference/mmu/Mus_musculus/UCSC/mm9/Sequence/WholeGenomeFasta/genome.fa mm9_hisat2 /data/results/tools/rnaseq/hisat/hisat2-2.1.0/hisat2 -p 8 --dta -x mm9_hisat -1 /mnt/max/b/genomics_facility/DKlab/mr/rnaseq/RNA_Seq/HI.1370.002.Index_10.wt01_R1.fastq.gz -2 /mnt/max/b/genomics_facility/DKlab/mr/rnaseq/RNA_Seq/HI.1370.002.Index_10.wt01_R2.fastq.gz -S s.sam #hisat untrimmed time /data/results/tools/rnaseq/hisat/hisat2-2.1.0/hisat2 -p 8 --dta -x mm9_hisat2 -1 /mnt/max/b/genomics_facility/DKlab/mr/rnaseq/RNA_Seq/HI.1370.002.Index_10.wt01_R1.fastq.gz -2 /mnt/max/b/genomics_facility/DKlab/mr/rnaseq/RNA_Seq/HI.1370.002.Index_10.wt01_R2.fastq.gz | /data/results/tools/samtools/samtools-1.3/samtools sort -@ 8 -o wt0h1.bam 42154307 reads; of these: 42154307 (100.00%) were paired; of these: 2343038 (5.56%) aligned concordantly 0 times 37587574 (89.17%) aligned concordantly exactly 1 time 2223695 (5.28%) aligned concordantly >1 times ---- 2343038 pairs aligned concordantly 0 times; of these: 533599 (22.77%) aligned discordantly 1 time ---- 1809439 pairs aligned 0 times concordantly or discordantly; of these: 3618878 mates make up the pairs; of these: 2300264 (63.56%) aligned 0 times 1162316 (32.12%) aligned exactly 1 time 156298 (4.32%) aligned >1 times 97.27% overall alignment rate [bam_sort_core] merging from 48 files... real 18m0.596s user 118m1.836s sys 12m56.824s --dta reports alignments tailored for transcript assemblers -k (default: 5) report up to alns per read -rw-r--r-- 1 reczko users 4.2G Jun 13 2017 wt0h1.bam ]0;/data/images/proton/DKlab/mr/rnaseqreczko@max:/data/images/proton/DKlab/mr/rnaseq$ ls -lh /data/images/proton/DKlab/orsalia/sortedwt01pos.bam -rw-rw-r-- 1 reczko users 6.2G Sep 10 2014 /data/images/proton/DKlab/orsalia/sortedwt01pos.bam ]0;/data/images/proton/DKlab/mr/rnaseqreczko@max:/data/images/proton/DKlab/mr/rnaseq$ samtools flagstat !$ samtools flagstat /data/images/proton/DKlab/orsalia/sortedwt01pos.bam 78892142 + 0 in total (QC-passed reads + QC-failed reads) 0 + 0 duplicates 78892142 + 0 mapped (100.00%:-nan%) 78892142 + 0 paired in sequencing 39446071 + 0 read1 39446071 + 0 read2 75557584 + 0 properly paired (95.77%:-nan%) 78892142 + 0 with itself and mate mapped 0 + 0 singletons (0.00%:-nan%) 0 + 0 with mate mapped to a different chr 0 + 0 with mate mapped to a different chr (mapQ>=5) ]0;/data/images/proton/DKlab/mr/rnaseqreczko@max:/data/images/proton/DKlab/mr/rnaseq$ samtools flagstat wt0h1.bam 91588434 + 0 in total (QC-passed reads + QC-failed reads) 0 + 0 duplicates 89288170 + 0 mapped (97.49%:-nan%) 91588434 + 0 paired in sequencing 45807114 + 0 read1 45781320 + 0 read2 86638218 + 0 properly paired (94.60%:-nan%) 88235258 + 0 with itself and mate mapped 1052912 + 0 singletons (1.15%:-nan%) 280381 + 0 with mate mapped to a different chr 106869 + 0 with mate mapped to a different chr (mapQ>=5) /data/results/tools/proton/tophat2stranded-sampled-bigwig1.sh wt0h1.bam /data/results/reference/mmu/Mus_musculus/UCSC/mm9/Sequence/WholeGenomeFasta/genome.fa.fai 1 #guess adapter /data/results/tools/adapter/minion/minion search-adapter -i /mnt/max/b/genomics_facility/DKlab/mr/rnaseq/RNA_Seq/HI.1370.002.Index_10.wt01_R1.fastq.gz ]0;/data/images/proton/DKlab/mr/rnaseqreczko@max:/data/images/proton/DKlab/mr/rnaseq$ /data/results/tools/adapter/minion/minion search-adapter -i /mnt/max/b/genomics_facility/DKlab/mr/rnaseq/RNA_Seq/HI.1370.002.Index_10.wt01_R1.fastq.gz [minion] reading reads ................................................. 1 ................................................. 2 [minion] connected component analysis [minion] building consensus sequences criterion=sequence-density sequence-density=1.09 sequence-density-rank=1 fanout-score=57.60 fanout-score-rank=1 prefix-density=1.53 prefix-fanout=41.3 sequence=AGATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCT criterion=fanout-score sequence-density=1.09 sequence-density-rank=1 fanout-score=57.60 fanout-score-rank=1 prefix-density=1.53 prefix-fanout=41.3 sequence=AGATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCT /data/results/tools/adapter/minion/minion search-adapter -i /mnt/max/b/genomics_facility/DKlab/mr/rnaseq/RNA_Seq/HI.1370.002.Index_10.wt01_R2.fastq.gz [minion] reading reads ................................................. 1 ................................................. 2 [minion] connected component analysis [minion] building consensus sequences criterion=sequence-density sequence-density=1.08 sequence-density-rank=1 fanout-score=39.68 fanout-score-rank=1 prefix-density=1.49 prefix-fanout=28.7 sequence=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTAT criterion=fanout-score sequence-density=1.08 sequence-density-rank=1 fanout-score=39.68 fanout-score-rank=1 prefix-density=1.49 prefix-fanout=28.7 sequence=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTAT /data/results/tools/adapter/minion/minion search-adapter -i /mnt/max/b/genomics_facility/DKlab/mr/rnaseq/RNA_Seq/HI.1370.002.Index_10.wt01_R1.fastq.gz sequence=AGATCGGAAGAGCACACGTCTGAACTCCAGTCACTAGCTTATCTCGTATGCCGTCT -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ #cutadapt-doc /data/results/tools/adapter/minion/minion search-adapter -i /mnt/max/b/genomics_facility/DKlab/mr/rnaseq/RNA_Seq/HI.1370.002.Index_10.wt01_R2.fastq.gz sequence=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTAT -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT \ #cutadapt-doc AGATCGGAAGAGC http://seqanswers.com/forums/archive/index.php/t-17168.html fkrueger 01-27-2012, 05:29 AM We find that using the first 13bp of the Illumina adapter ('AGATCGGAAGAGC') efficiently removes adapter contamination for both paired-end files (the adapters on both sides share this sequence before they fork, and any of the Illumina multiplex barcodes should be further downstream of that). A typical command for Cutadapt could be ./cutadapt -f fastq -O $stringency -q 20 -a AGATCGGAAGAGC input_file.fastq $stringency would define the overlap with the adapter required for it to remove sequence from the end, the default is 3 I believe. This command would remove poor quality sequence as well as adapters from your FastQ file. You should only be careful with the option of removing sequences if they become too short, because this can throw off the sequence-by-sequence order of paired-end files which is required by many aligners. I hope this helps pip install --user --upgrade cutadapt https://cutadapt.readthedocs.io/en/stable/guide.html#illumina-truseq f you have paired-end data, trim also read 2 with the reverse complement of the “TruSeq Universal Adapter”. The full command-line looks as follows: cutadapt \ -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC \ -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT \ -o trimmed.1.fastq.gz -p trimmed.2.fastq.gz \ reads.1.fastq.gz reads.2.fastq.gz See also the section about paired-end adapter trimming above. If you want to simplify this a bit, you can also use the common prefix AGATCGGAAGAGC as the adapter sequence in both cases. However, you should be aware that this sequence occurs multiple times in the human genome and it could therefore skew your results very slightly at those loci cutadapt \ -a AGATCGGAAGAGC -A AGATCGGAAGAGC \ -o trimmed.1.fastq.gz -p trimmed.2.fastq.gz \ reads.1.fastq.gz reads.2.fastq.gz # ~/.local/bin/cutadapt -j 16 -a AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC -A AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTAT -o wt01_R1_trm.fastq.gz -p wt01_R2_trm.fastq.gz /mnt/max/b/genomics_facility/DKlab/mr/rnaseq/RNA_Seq/HI.1370.002.Index_10.wt01_R1.fastq.gz /mnt/max/b/genomics_facility/DKlab/mr/rnaseq/RNA_Seq/HI.1370.002.Index_10.wt01_R2.fastq.gz Processing reads on 1 core in paired-end mode ... Finished in 1220.48 s (29 us/read; 2.07 M reads/minute). === Summary === Total read pairs processed: 42,154,307 Read 1 with adapter: 2,453,511 (5.8%) Read 2 with adapter: 2,620,834 (6.2%) Pairs written (passing filters): 42,154,307 (100.0%) Total basepairs processed: 8,430,861,400 bp Read 1: 4,215,430,700 bp Read 2: 4,215,430,700 bp Total written (filtered): 8,390,705,885 bp (99.5%) Read 1: 4,195,579,877 bp Read 2: 4,195,126,008 bp === First read: Adapter 1 === Sequence: AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC; Type: regular 3'; Length: 34; Trimmed: 2453511 times. No. of allowed errors: 0-9 bp: 0; 10-19 bp: 1; 20-29 bp: 2; 30-34 bp: 3 Bases preceding removed adapters: A: 17.8% C: 31.8% G: 34.4% T: 15.9% none/other: 0.0% Overview of removed sequences length count expect max.err error counts 3 951877 658661.0 0 951877 4 289836 164665.3 0 289836 5 151608 41166.3 0 151608 6 107252 10291.6 0 107252 7 93774 2572.9 0 93774 8 86187 643.2 0 86187 9 79069 160.8 0 78188 881 10 73180 40.2 1 70601 2579 11 65718 10.1 1 63986 1732 12 60438 2.5 1 58567 1871 13 54836 0.6 1 53164 1672 14 49785 0.2 1 48478 1307 15 45370 0.0 1 44057 1313 16 39667 0.0 1 38443 1224 17 34940 0.0 1 34023 917 18 30880 0.0 1 29885 942 53 19 26888 0.0 1 25547 1252 89 20 24260 0.0 2 23271 863 126 21 21734 0.0 2 20889 749 96 22 19286 0.0 2 18334 852 100 23 17228 0.0 2 16570 566 92 24 15753 0.0 2 14982 673 98 25 13924 0.0 2 13303 540 81 26 12142 0.0 2 11482 574 86 27 11084 0.0 2 10566 455 58 5 28 9320 0.0 2 8781 445 74 20 29 8158 0.0 2 7718 376 40 24 30 7041 0.0 3 6634 314 57 36 31 6282 0.0 3 5799 403 59 21 32 5585 0.0 3 5261 270 45 9 33 4863 0.0 3 4588 230 35 10 34 4267 0.0 3 4025 199 29 14 35 3927 0.0 3 3714 181 24 8 36 3437 0.0 3 3245 167 17 8 37 2959 0.0 3 2794 135 22 8 38 2596 0.0 3 2458 124 8 6 39 2351 0.0 3 2214 112 19 6 40 2000 0.0 3 1881 105 11 3 41 1762 0.0 3 1669 84 6 3 42 1520 0.0 3 1454 56 6 4 43 1278 0.0 3 1213 56 3 6 44 1137 0.0 3 1075 55 6 1 45 1076 0.0 3 1013 57 3 3 46 916 0.0 3 868 42 5 1 47 815 0.0 3 767 41 5 2 48 700 0.0 3 666 25 5 4 49 661 0.0 3 624 32 3 2 50 568 0.0 3 539 27 2 51 457 0.0 3 438 16 1 2 52 398 0.0 3 379 17 1 1 53 329 0.0 3 307 22 54 314 0.0 3 303 7 2 2 55 278 0.0 3 264 12 2 56 221 0.0 3 206 12 3 57 201 0.0 3 193 5 2 1 58 195 0.0 3 186 8 0 1 59 134 0.0 3 127 7 60 139 0.0 3 130 9 61 109 0.0 3 104 3 1 1 62 98 0.0 3 92 5 1 63 69 0.0 3 68 1 64 56 0.0 3 51 4 0 1 65 39 0.0 3 37 2 66 40 0.0 3 37 3 67 32 0.0 3 29 3 68 31 0.0 3 29 2 69 18 0.0 3 18 70 13 0.0 3 13 71 12 0.0 3 11 1 72 9 0.0 3 8 1 73 10 0.0 3 10 74 2 0.0 3 2 75 3 0.0 3 3 76 2 0.0 3 2 77 3 0.0 3 3 78 3 0.0 3 3 79 7 0.0 3 7 80 9 0.0 3 9 81 9 0.0 3 8 0 0 1 82 7 0.0 3 7 83 4 0.0 3 4 84 5 0.0 3 5 85 7 0.0 3 7 86 14 0.0 3 12 0 0 2 87 10 0.0 3 9 1 88 9 0.0 3 9 89 8 0.0 3 8 90 18 0.0 3 18 91 9 0.0 3 9 92 11 0.0 3 11 93 8 0.0 3 7 0 1 94 11 0.0 3 11 95 10 0.0 3 9 0 1 96 4 0.0 3 4 97 9 0.0 3 9 98 6 0.0 3 5 0 0 1 99 4 0.0 3 3 1 100 182 0.0 3 6 164 12 === Second read: Adapter 2 === Sequence: AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTAT; Type: regular 3'; Length: 54; Trimmed: 2620834 times. No. of allowed errors: 0-9 bp: 0; 10-19 bp: 1; 20-29 bp: 2; 30-39 bp: 3; 40-49 bp: 4; 50-54 bp: 5 Bases preceding removed adapters: A: 19.5% C: 33.7% G: 36.8% T: 10.0% none/other: 0.0% Overview of removed sequences length count expect max.err error counts 3 1088147 658661.0 0 1088147 4 315258 164665.3 0 315258 5 162649 41166.3 0 162649 6 107745 10291.6 0 107745 7 93863 2572.9 0 93863 8 85481 643.2 0 85481 9 78678 160.8 0 77530 1148 10 73385 40.2 1 70174 3211 11 65514 10.1 1 63131 2383 12 60106 2.5 1 58663 1443 13 54330 0.6 1 52798 1532 14 49363 0.2 1 48222 1141 15 44871 0.0 1 42638 2233 16 39302 0.0 1 38064 1238 17 34671 0.0 1 33669 1002 18 30527 0.0 1 28749 1768 10 19 26614 0.0 1 25678 882 54 20 24052 0.0 2 22682 1004 366 21 21538 0.0 2 20363 1000 175 22 19149 0.0 2 18261 781 107 23 17066 0.0 2 16277 672 117 24 15618 0.0 2 14863 671 84 25 13806 0.0 2 13115 621 70 26 12069 0.0 2 11351 639 79 27 10972 0.0 2 10264 595 111 2 28 9225 0.0 2 8600 526 93 6 29 8083 0.0 2 7514 491 67 11 30 6994 0.0 3 6531 376 64 23 31 6235 0.0 3 5101 947 101 86 32 5544 0.0 3 5052 302 69 121 33 4816 0.0 3 4370 338 74 34 34 4230 0.0 3 3848 286 62 34 35 3893 0.0 3 3567 265 44 17 36 3409 0.0 3 3118 238 37 16 37 2937 0.0 3 2710 188 27 12 38 2578 0.0 3 2338 186 35 19 39 2325 0.0 3 2112 169 26 15 3 40 1981 0.0 4 1805 128 30 8 10 41 1750 0.0 4 1615 104 20 8 3 42 1512 0.0 4 1385 102 14 8 3 43 1275 0.0 4 1120 121 21 5 8 44 1128 0.0 4 1034 62 16 11 5 45 1073 0.0 4 977 60 21 7 8 46 916 0.0 4 803 71 29 7 6 47 816 0.0 4 733 60 9 7 7 48 689 0.0 4 629 35 8 11 5 1 49 659 0.0 4 598 50 9 0 2 50 563 0.0 5 513 35 6 8 1 51 453 0.0 5 412 31 6 2 2 52 400 0.0 5 349 37 9 3 2 53 329 0.0 5 288 33 3 4 0 1 54 308 0.0 5 271 25 7 3 1 1 55 278 0.0 5 247 21 8 2 56 218 0.0 5 201 11 4 1 0 1 57 201 0.0 5 178 12 3 4 3 1 58 190 0.0 5 173 10 1 5 1 59 134 0.0 5 115 17 0 1 1 60 139 0.0 5 120 12 4 3 61 110 0.0 5 99 6 1 1 1 2 62 100 0.0 5 83 8 2 5 2 63 70 0.0 5 66 2 0 1 1 64 56 0.0 5 52 1 0 2 1 65 40 0.0 5 36 2 0 0 2 66 41 0.0 5 38 2 1 67 33 0.0 5 25 4 0 3 0 1 68 32 0.0 5 24 5 1 1 0 1 69 18 0.0 5 17 0 0 1 70 14 0.0 5 12 2 71 13 0.0 5 11 2 72 9 0.0 5 8 1 73 10 0.0 5 8 2 74 2 0.0 5 2 75 5 0.0 5 4 1 76 2 0.0 5 2 77 4 0.0 5 2 1 0 0 1 78 5 0.0 5 3 0 0 0 0 2 79 8 0.0 5 6 1 0 0 0 1 80 9 0.0 5 9 81 8 0.0 5 7 1 82 7 0.0 5 7 83 4 0.0 5 3 1 84 5 0.0 5 5 85 7 0.0 5 6 0 0 1 86 12 0.0 5 12 87 9 0.0 5 8 1 88 9 0.0 5 8 0 1 89 7 0.0 5 7 90 18 0.0 5 13 3 0 1 1 91 9 0.0 5 7 2 92 11 0.0 5 10 0 0 1 93 7 0.0 5 6 1 94 11 0.0 5 7 4 95 10 0.0 5 6 3 0 0 0 1 96 4 0.0 5 2 2 97 10 0.0 5 7 3 98 5 0.0 5 1 4 99 4 0.0 5 1 3 100 41 0.0 5 10 12 7 2 5 5 #hisat trimmed time /data/results/tools/rnaseq/hisat/hisat2-2.1.0/hisat2 -p 18 --dta -x mm9_hisat2 -1 wt01_R1_trm.fastq.gz -2 wt01_R2_trm.fastq.gz | /data/results/tools/samtools/samtools-1.3/samtools sort -@ 8 -o wt0h1_trm.bam Warning: skipping mate #1 of read 'HWI-ST521:209:C3DHAACXX:2:1101:3892:33181 1:N:0:TAGCTT' because length (0) <= # seed mismatches (0) Warning: skipping mate #1 of read 'HWI-ST521:209:C3DHAACXX:2:1101:3892:33181 1:N:0:TAGCTT' because it was < 2 characters long 42154307 reads; of these: 42154307 (100.00%) were paired; of these: 1557882 (3.70%) aligned concordantly 0 times 38304400 (90.87%) aligned concordantly exactly 1 time 2292025 (5.44%) aligned concordantly >1 times ---- 1557882 pairs aligned concordantly 0 times; of these: 129979 (8.34%) aligned discordantly 1 time ---- 1427903 pairs aligned 0 times concordantly or discordantly; of these: 2855806 mates make up the pairs; of these: 1811952 (63.45%) aligned 0 times 952634 (33.36%) aligned exactly 1 time 91220 (3.19%) aligned >1 times 97.85% overall alignment rate [bam_sort_core] merging from 48 files... real 18m58.772s user 262m43.108s sys 18m16.408s 4397279982 Sep 26 12:30 wt0h1_trm.bam samtools flagstat wt0h1_trm.bam 91733796 + 0 in total (QC-passed reads + QC-failed reads) 0 + 0 duplicates 89921844 + 0 mapped (98.02%:-nan%) 91733796 + 0 paired in sequencing 45880607 + 0 read1 45853189 + 0 read2 88453682 + 0 properly paired (96.42%:-nan%) 88924444 + 0 with itself and mate mapped 997400 + 0 singletons (1.09%:-nan%) 177280 + 0 with mate mapped to a different chr 96677 + 0 with mate mapped to a different chr (mapQ>=5) ln -s wt01_R1_trm.fastq.gz wt01_R1_trm_1.fastq.gz ln -s wt01_R2_trm.fastq.gz wt01_R2_trm_2.fastq.gz PATH=/data/results/tools/align/bowtie2-2.3.2:$PATH; export PATH time /data/results/tools/align/tophat-2.1.1.Linux_x86_64/tophat2 -o wt0h1 -p 18 /data/results/reference/mmu/Mus_musculus/UCSC/mm9/Sequence/Bowtie2Index/genome wt01_R1_trm_1.fastq.gz wt01_R2_trm_2.fastq.gz [2018-09-26 15:45:54] A summary of the alignment counts can be found in wt0h1/align_summary.txt [2018-09-26 15:45:54] Run complete: 02:33:08 elapsed real 153m8.785s user 1385m54.136s sys 358m22.076s samtools flagstat /data/images/proton/DKlab/mr/rnaseq/wt0h1/accepted_hits.bam 117816393 + 0 in total (QC-passed reads + QC-failed reads) 0 + 0 duplicates 117816393 + 0 mapped (100.00%:-nan%) 117816235 + 0 paired in sequencing 59273512 + 0 read1 58542723 + 0 read2 243168 + 0 properly paired (0.21%:-nan%) 113711582 + 0 with itself and mate mapped 4104653 + 0 singletons (3.48%:-nan%) 107366004 + 0 with mate mapped to a different chr 59731888 + 0 with mate mapped to a different chr (mapQ>=5) /data/results/tools/align/tophat-2.1.1.Linux_x86_64/tophat2 -o wt0h1_v2 -p 18 --library-type fr-unstranded -g 1 /data/results/reference/mmu/Mus_musculus/UCSC/mm9/Sequence/Bowtie2Index/genome wt01_R1_trm_1.fastq.gz wt01_R2_trm_2.fastq.gz #hisat untrimmed samtools view -F 4 wt0h1.bam |awk -f /data/images/proton/DKlab/mr/rnaseq/get-multimap-stats.awk #reads 41481938 #mappings 89288170 1 900631 2.17114 2 38271226 92.26 3 25177 0.0606939 4 1604016 3.86678 5 10580 0.0255051 6 282620 0.681309 7 589 0.0014199 8 135042 0.325544 9 157 0.000378478 10 251900 0.607252 # v1-alignment samtools view -F 4 /data/images/proton/DKlab/orsalia/sortedwt01pos.bam |awk -f /data/images/proton/DKlab/mr/rnaseq/get-multimap-stats.awk #reads 39446071 #mappings 78892142 2 39446071 100 # v1-alignment, head -2 HWI-ST521:209:C3DHAACXX:2:1101:1826:14212 99 chr3 58326828 50 81M = 58326935 191 GGCATTTCAATCAAAACTTTAGTTGATGTGCCTTTTTGGTATTAAAAGTGCAATTAACATGCTAAGATATCATACGGCACA <<< (default: 5) report up to alns per read