# With genomic mRNA, the longest transcript including all introns is meant. Some of these are huge (>2Mbases). Almost the same as pure genomic alignment. # cd /data/results/reference/mmu/mm9/mRNA-genomic-max-intron-15k/ awk -f get-introns-max1k.awk /data/results/reference/mmu/Mus_musculus.NCBIM37.64.gtf > Mus_musculus.NCBIM37.64-toMM9.max-intron-1k.gtf /data/results/tools/align/cufflinks-2.2.1.Linux_x86_64/gffread -w Mus_musculus.NCBIM37.64-toMM9.merged -g /data/results/reference/mmu/Mus_musculus/UCSC/mm9/Sequence/WholeGenomeFasta/genome.fa /data/results/reference/mmu/Mus_musculus.NCBIM37.64-toMM9.introns.gtf -M -d dupinfo.txt # 259338129 Mar 22 19:10 Mus_musculus.NCBIM37.64-toMM9.merged ln -s Mus_musculus.NCBIM37.64-toMM9.max-intron-1k mm9-genomic-mRNA.fa python /data/results/tools/align/SHRiMP_2_2_3/SHRiMP_2_2_3/utils/project-db.py --seed 010111111111111,101011111111111,110101111111111,111010111111111,111101011111111,111110101111111,111111010111111,111111101011111,111111110101111,111111111010111,111111111101011,111111111110101,111111111111010,111111111111101,111111111111111 --h-flag --shrimp-mode cs mm9-genomic-mRNA.fa # v0 cat /data/results/reference/mmu/Mus_musculus.NCBIM37.64-toMM9.gtf | awk -f /data/results/reference/get-transcript-range1.awk > Mus_musculus.NCBIM37.64-toMM9.transcript-ranges.gtf Longest range 4434881 for ENSMUST00000127664 /data/results/tools/align/cufflinks-2.2.1.Linux_x86_64/gffread -O -w Mus_musculus.NCBIM37.64-toMM9.fa -g /data/results/reference/mmu/Mus_musculus/UCSC/mm9/Sequence/WholeGenomeFasta/genome.fa Mus_musculus.NCBIM37.64-toMM9.transcript-ranges.gtf ln -s Mus_musculus.NCBIM37.64-toMM9.fa mm9-genomic-transcripts.fa mv mm9-genomic-transcripts.fa mm9-genomic-mRNA.fa python /data/results/tools/align/SHRiMP_2_2_3/SHRiMP_2_2_3/utils/project-db.py --seed 
010111111111111,101011111111111,110101111111111,111010111111111,111101011111111,111110101111111,111111010111111,111111101011111,111111110101111,111111111010111,111111111101011,111111111110101,111111111111010,111111111111101,111111111111111 --h-flag --shrimp-mode cs mm9-genomic-mRNA.fa #export SHRIMP_FOLDER=/data/results/tools/align/SHRiMP_2_2_3 # $SHRIMP_FOLDER/bin/gmapper-cs -L /data/results/reference/mmu/mm9/mRNA-genomic/mm9-genomic-mRNA-cs /data/images/proton/DKlab/mr/parclip/raw/0hrep3/ugc_604_7_F3.csfasta -N 20 -n 1 --local -o 10 -v 20% -h 20% -r 30% -w 150% -e -255 -f -255 | samtools view -Sb - | samtools sort -o - - >0hrep3-15mMm.bam 2>0hrep3-15mMm-bam.log ./shrimp-map-genomic-mRNA1.sh &> shrimp-map-genomic-mRNA1.log & for i in ?h*Mm.bam.md.bam do echo $i /data/results/tools/samtools/samtools-1.3/samtools calmd -b $i /data/results/reference/mmu/mm9/mRNA-genomic/mm9-genomic-mRNA.fa > foo.bam samtools view -h foo.bam | awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/sam2bowtie.awk > $i.md.bt mv foo.bam $i.md.bam done -rw-r--r-- 1 reczko users 148557509 Jan 10 18:11 -.0000.bam -rw-r--r-- 1 reczko users 0 Jan 10 15:22 2hrep2-15mMm.bam -rw-r--r-- 1 reczko users 73 Jan 10 15:22 2hrep2-15mMm-bam.log -rw-r--r-- 1 reczko users 4987 Jan 10 19:18 shrimp-map-genomic-mRNA1.log 100000 126539 8435. 200000 114114 7607. 300000 111847 7456. 400000 114835 7655. 500000 117850 7856. 600000 109587 7305. 
-rw-r--r-- 1 reczko users 148557509 Jan 10 18:11 -.0000.bam -rw-r--r-- 1 reczko users 146582742 Jan 10 20:19 -.0001.bam -rw-r--r-- 1 reczko users 148630312 Jan 10 22:32 -.0002.bam -rw-r--r-- 1 reczko users 150315507 Jan 11 00:44 -.0003.bam -rw-r--r-- 1 reczko users 148262440 Jan 11 02:55 -.0004.bam -rw-r--r-- 1 reczko users 148821999 Jan 11 05:03 -.0005.bam -rw-r--r-- 1 reczko users 150728759 Jan 11 07:13 -.0006.bam -rw-r--r-- 1 reczko users 153555758 Jan 11 09:20 -.0007.bam -rw-r--r-- 1 reczko users 151645868 Jan 11 11:31 -.0008.bam -rw-r--r-- 1 reczko users 0 Jan 10 15:22 2hrep2-15mMm.bam -rw-r--r-- 1 reczko users 73 Jan 10 15:22 2hrep2-15mMm-bam.log lrwxrwxrwx 1 reczko users 42 Jan 11 11:42 .#README-shrmip-genomic-mRNA.txt -> reczko@max.fleming.local.114460:1458829862 -rw-r--r-- 1 reczko users 1739 Jan 10 15:22 README-shrmip-genomic-mRNA.txt -rw-r--r-- 1 reczko users 1673 Jan 10 14:58 README-shrmip-genomic-mRNA.txt~ -rw-r--r-- 1 reczko users 5360 Jan 11 11:11 shrimp-map-genomic-mRNA1.log -rwxr--r-- 1 reczko users 3955 Jan 10 15:21 shrimp-map-genomic-mRNA1.sh 100000 126539 8435. 200000 114114 7607. 300000 111847 7456. 400000 114835 7655. 500000 117850 7856. 600000 109587 7305. 700000 109807 7320. 800000 112938 7529. 900000 110792 7386. 1000000 112044 7469. 1100000 112111 7474. 1200000 117267 7817. 1300000 111342 7422. 1400000 114061 7604. 1500000 113067 7537. 1600000 115095 7673. 1700000 113361 7557. 1800000 117795 7853. 1900000 113579 7571. 2000000 112691 7512. 2100000 112134 7475. 2200000 115896 7726. 
18h for 2.2M (/ 18 2.2) (* 8 8.181818181818182) (/ 65.45454545454545 24) (* 6 2.727272727272727) 16.363636363636363 (days @30cores) cd /data/results/reference/mmu/mm9/mRNA-genomic/ /home/reczko/bin/faToTwoBit -noMask mm9-genomic-mRNA.fa mm9-genomic-mRNA.2bit #convert bam to bowtie /data/images/proton/DKlab/mr/parclip/shrimp/s2b.sh #@ restrict intron size to 15k all: 3.1G Feb 15 18:53 ../mRNA-genomic/Mus_musculus.NCBIM37.64-toMM9.fa https://link.springer.com/content/pdf/10.1007%2Fs11427-013-4540-y.pdf => 6k https://www.ncbi.nlm.nih.gov/pubmed/15217358 => 11k cat /data/results/reference/mmu/Mus_musculus.NCBIM37.64-toMM9.gtf | awk -f get-transcript-range-max-intron-15k-1.awk > Mus_musculus.NCBIM37.64-toMM9.transcript-ranges.gtf Longest range 3009068 for ENSMUST00000124096 Longest range 2257238 for ENSMUST00000114000 Longest range 2241308 for ENSMUST00000114641 Longest range 2072513 for ENSMUST00000107287 2.9G Feb 15 19:14 Mus_musculus.NCBIM37.64-toMM9.fa -i discard transcripts having an intron larger than the given maximum (gffread option) /data/results/tools/align/cufflinks-2.2.1.Linux_x86_64/gffread -O -w Mus_musculus.NCBIM37.64-toMM9.fa -g /data/results/reference/mmu/Mus_musculus/UCSC/mm9/Sequence/WholeGenomeFasta/genome.fa Mus_musculus.NCBIM37.64-toMM9.transcript-ranges.gtf ln -s Mus_musculus.NCBIM37.64-toMM9.fa mm9-genomic-transcripts.fa mv mm9-genomic-transcripts.fa mm9-genomic-mRNA.fa python /data/results/tools/align/SHRiMP_2_2_3/SHRiMP_2_2_3/utils/project-db.py --seed 010111111111111,101011111111111,110101111111111,111010111111111,111101011111111,111110101111111,111111010111111,111111101011111,111111110101111,111111111010111,111111111101011,111111111110101,111111111111010,111111111111101,111111111111111 --h-flag --shrimp-mode cs mm9-genomic-mRNA.fa