# read quality assessment: /data/images/proton/DKlab/mr/parclip/raw/pipeline1.sh /data/results/tools/solid/qc-filter/SOLiD_preprocess_filter_v2_MR1.pl -a on -o 0hrep1filt2 -f 0hrep1/ugc_604_1_F3.csfasta -g 0hrep1/ugc_604_1_F3.QV.qual wc 0hrep1/ugc_604_1_F3.csfasta 22967710 22967710 798859510 0hrep1/ugc_604_1_F3.csfasta wc 0hrep1filt_T_F3.csfasta 9835260 9835260 341609082 0hrep1filt_T_F3.csfasta # standart qual. filter removes >50% of reads # INDEX BUILDING # get ENSEMBL gtf annotation (for mm10) /data/results/reference/mmu/Mus_musculus.NCBIM37.64.gtf # lift coordinates to mm9 using https://genome.ucsc.edu/cgi-bin/hgLiftOver # prepare gtf for index building cd /data/results/reference/mmu/mm9/mRNA-genomic-max-intron-15k/ awk -f get-introns-max1k-v2.awk /data/results/reference/mmu/Mus_musculus.NCBIM37.64.gtf > Mus_musculus.NCBIM37.64-toMM9.tr-with-1k-introns.gtf # extract fasta seqs from whole genome using gtf /data/results/tools/align/cufflinks-2.2.1.Linux_x86_64/gffread -O -F -w Mus_musculus.NCBIM37.64-toMM9.tr-with-1k-introns.fa -g /data/results/reference/mmu/Mus_musculus/UCSC/mm9/Sequence/WholeGenomeFasta/genome.fa Mus_musculus.NCBIM37.64-toMM9.tr-with-1k-introns.gtf -M -d dupinfo.tr-with-1k-introns.txt # generate 2bit file needed for PARalyzer ln -s Mus_musculus.NCBIM37.64-toMM9.tr-with-1k-introns.fa mm9-mRNA-introns1k.fa /home/reczko/bin/faToTwoBit mm9-mRNA-introns1k.fa mm9-mRNA-introns1k.2bit #build index for shrimp export SHRIMP_FOLDER=/data/results/tools/align/SHRiMP_2_2_3 python /data/results/tools/align/SHRiMP_2_2_3/SHRiMP_2_2_3/utils/project-db.py --seed 010111111111111,101011111111111,110101111111111,111010111111111,111101011111111,111110101111111,111111010111111,111111101011111,111111110101111,111111111010111,111111111101011,111111111110101,111111111111010,111111111111101,111111111111111 --h-flag --shrimp-mode cs mm9-mRNA-introns1k.fa #prepare regions for annotation reczko@max:/data/results/reference/mmu$ awk -f get-ncRNA2.awk /data/results/reference/mmu/Mus_musculus.NCBIM37.64-toMM9.gtf > ncRNA.bed rm intron.bed cds.bed 3utr.bed 5utr.bed ; awk -f /data/results/tools/rnaseq/get-3utr4.awk /data/results/reference/mmu/Mus_musculus.NCBIM37.64-toMM9.gtf > /data/results/reference/mmu/Mus_musculus.NCBIM37.64-toMM9.3utr #ALIGNMENT # doc in /data/results/tools/align/SHRiMP_2_2_3/README.SHRiMP132 #one sample export SHRIMP_FOLDER=/data/results/tools/align/SHRiMP_2_2_3 $SHRIMP_FOLDER/bin/gmapper-cs -L /data/results/reference/mmu/mm9/mRNA-genomic-max-intron-15k/mm9-mRNA-introns1k-cs /data/images/proton/DKlab/mr/parclip/raw/2hrep2/*.csfasta -N 11 -n 1 --local -o 10 -v 20% -h 20% -r 30% -w 150% -e -255 -f -255 | samtools view -Sb - | samtools sort -o - - >2hrep2-15mMm1kIntron.bam 2>2hrep2-15mMm1kIntron-bam.log -N/--threads Number of Threads (default: 1) -n/--cmw-mode Match Mode (default: unpaired:2 paired:4) --local Perform local alignment (default: disabled) -o/--report Maximum Hits per Read (default: 10) -v/--vec-threshold SW Vector Hit Threshold (default: 47.00%) -h/--full-threshold SW Full Hit Threshold (default: 50.00%) -r/--cmw-threshold Window Generation Threshold (default: 55.00%) -w/--match-window Match Window Length (default: 140.00%) -e/--ext-r SW Gap Extend Score(Reference)(default: -7) -f/--ext-q SW Gap Extend Score (Query) (default: -3) #all samples: /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/shrimp-map-mRNA1.sh /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/shrimp-map-mRNA2.sh #alignment statistics in: /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/shrimp-map-mRNA1.logB /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/shrimp-map-mRNA2.log #get mismatch stats /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/get-mismatch-stats1.sh #PARALYZER pre-processing #prepare flags in bam (input $i) and convert to bowtie format (output $i.md.bt): /data/images/proton/DKlab/mr/parclip/paralyzer/README-paralyzer15.txt for i in *-15mMm1kIntron.bam do echo $i # add MD flags for bowtie format /data/results/tools/samtools/samtools-1.3/samtools calmd -b $i /data/results/reference/mmu/mm9/mRNA-genomic-max-intron-15k/Mus_musculus.NCBIM37.64-toMM9.tr-with-1k-introns.fa >& /dev/null > foo.bam2 samtools view -h foo.bam2 | awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/sam2bowtie.awk2 > $i.md.bt mv foo.bam2 $i.md.bam done #PARALYZER #one sample: ./myp-100chunks.sh /data/images/proton/DKlab/mr/parclip/paralyzer/ini-files-1kIntron/sh.IGG.ini &> IGG.1kIntrons-log2 ./mergeMyPar-100chunks.sh sh-clusters-IGG.txt2-1kIntrons wc -l sh-clusters-IGG.txt2-1kIntrons.csv ./mergeMyPar-100chunks.sh sh-groups-IGG.txt2-1kIntrons ./mergeMyPar-100chunks.sh sh-distributions-IGG.txt2-1kIntrons #all samples: /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/myp-all-1kIntrons2.sh #post-processing # paralyzer to bed, more than 5 TtoC for i in sh-clusters-*.txt2-1kIntrons2.csv do echo $i awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/pa2bed-gt5tc.awk $i > $i.bed.gt5tc done for i in sh-clusters-*.txt2-1kIntrons2.csv do echo $i awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/pa2bed-gt5tc-plus.awk $i > $i.bed.gt5tc.plus wc $i.bed.gt5tc.plus done t=read.table("sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc.plus",sep="\t") ms=t$V5 summary((ms)) modscore_analyis(ms,"IL4") #0.5086 0.7810 0.8472 0.8496 0.9235 1.0000 #modscore_cutoff for IL4 0.984 0.154163276322031 summary(t$V5>=0.984) #3251 write.table(t[t$V5>=0.984,],"sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt",row.names=F,col.names=F,quote=FALSE,sep="\t") t=read.table("sh-clusters-IGG.txt2-1kIntrons2.csv.bed.gt5tc.plus",sep="\t") ms=t$V5 summary((ms)) modscore_analyis(ms,"IGG") # 0.5661 0.7798 0.8439 0.8469 0.9174 1.0000 # [1] "modscore_cutoff for IGG 0.974 0.285225342489041" write.table(t[t$V5>=0.974,],"sh-clusters-IGG.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt",row.names=F,col.names=F,quote=FALSE,sep="\t") summary(t$V5>=0.974) #1205 #IGG removal for i in sh-clusters-*.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt do wc $i bedtools intersect -a $i -b sh-clusters-IGG.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt -v -s > $i.noIGG wc $i.noIGG done # 3251 19506 525188 sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt # 3219 19314 519834 sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt.noIGG rm sh-clusters-IGG.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt.noIGG #empty #add 5/3utr/cds info: for i in sh-clusters-*.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt.noIGG do echo $i awk -f /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/tracks/paralyzer2tracks3.awk $i > $i".genomic.bed" done # add E/I , PCG anno awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/pa2annotate-ExonIntron2.awk sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt.noIGG > sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt.noIGG.anno1.bed #get genomic coords, keep anno for i in sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt.noIGG.anno1.bed do echo $i awk -f /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/tracks/paralyzer2tracks3.awk $i > $i".genomic2.bed" done #bookend merge clusters overlapping in replicates for i in sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt.noIGG.anno1.bed.genomic2.bed do cat $i |sort -k1,1 -k2,2n |bedtools merge -i - -s -c 4,5,6 -o collapse,max,distinct > $i".collapsed" done for i in *f.flt*.genomic.bed.collapsed do awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-collapsed-cluster1.awk $i > $i".best.bed" done # get info from distributions # /data/images/proton/DKlab/mr/parclip/paralyzer/get-TtoC-conversionPct-noCutoff2.awk adds: ## 1..6(same) AvgConversionPct MaxConversionPct ConversionEventCount SdevConversionPct for i in sh-distributions-???*.txt2-1kIntrons2.csv do echo $i cat $i| awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-TtoC-conversionPct-noCutoff2.awk|awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/pa-dist2bed2.awk > $i".avg.csv3.bed" done # merge info from distributions and clusters bedtools intersect -a sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc -b sh-distributions-IL4.txt2-1kIntrons2.csv.avg.csv3.bed -wa -wb -s |awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/add_conversion_pct2.awk > sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2 bedtools intersect -a sh-clusters-IGG.txt2-1kIntrons2.csv.bed.gt5tc -b sh-distributions-IGG.txt2-1kIntrons2.csv.avg.csv3.bed -wa -wb -s |awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/add_conversion_pct2.awk > sh-clusters-IGG.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2 bedtools intersect -a sh-clusters-0hrep1.txt2-1kIntrons2.csv.bed.gt5tc -b sh-distributions-0hrep1.txt2-1kIntrons2.csv.avg.csv3.bed -wa -wb -s |awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/add_conversion_pct2.awk > sh-clusters-0hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2 bedtools intersect -a sh-clusters-0hrep2.txt2-1kIntrons2.csv.bed.gt5tc -b sh-distributions-0hrep2.txt2-1kIntrons2.csv.avg.csv3.bed -wa -wb -s |awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/add_conversion_pct2.awk > sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2 bedtools intersect -a sh-clusters-0hrep3.txt2-1kIntrons2.csv.bed.gt5tc -b sh-distributions-0hrep3.txt2-1kIntrons2.csv.avg.csv3.bed -wa -wb -s |awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/add_conversion_pct2.awk > sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2 # separate plus strand results for i in sh-clusters-*1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2 do wc $i awk -f ../filterPlusStrand.awk $i > $i.plus wc !$ done #1118143 6708858 197457040 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2 # 607183 3643098 107466384 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus 224576 1347456 41122435 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2 126400 758400 23175703 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus (/ 126400 607183.0) 0.20817447128789837 # IGG separate # more than 4 reads and convEvPct > 5: # cp=1.0*nc/(nc+nnc); # if ((cp>0.05)&&(nr>=5)) {print $0;} awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/filter_GE5reads.awk sh-clusters-IGG.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus > sh-clusters-IGG.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.f # more than 4 reads: # and IGG removal: for i in sh-clusters-*1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus do wc $i bedtools intersect -a $i -b sh-clusters-IGG.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.f -v -s |awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/filter_GE5reads.awk |sort -k1,1 -k2,2n > $i.noIGG.f wc !$ done 86718 520308 15500167 sh-clusters-0hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus 81990 491940 14644251 sh-clusters-0hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f 40699 244194 7204283 sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus 38699 232194 6846831 sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f 112849 677094 20301783 sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus 109373 656238 19664891 sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f 41427 248562 7313012 sh-clusters-IGG.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus 28145 168870 4993566 sh-clusters-IGG.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f 126400 758400 23175703 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus 116449 698694 21331311 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f #IL4 check # adaptive filtering of modscore a) get cutoff #topuniform=max(which(lp8>0.05)); #highest modscore that is not uniformly distributed Rscript /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/get_modescore_hist_1kIntrons3_f.r [1] "modscore_cutoff for IFN 0.946 0.0591007877911048" ^^^^^ this is the modscore cutoff awk -v th=0.972 -f /data/images/proton/DKlab/mr/parclip/paralyzer/filter_modescore1_keep_best.awk /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f > /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt #IL4 check # 3819 22914 733905 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt awk -v th=0.979 -f /data/images/proton/DKlab/mr/parclip/paralyzer/filter_modescore1_keep_best.awk /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-0hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f > /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-0hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt awk -v th=0.97 -f /data/images/proton/DKlab/mr/parclip/paralyzer/filter_modescore1_keep_best.awk /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f > /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt # 1334 8004 245720 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt awk -v th=0.964 -f /data/images/proton/DKlab/mr/parclip/paralyzer/filter_modescore1_keep_best.awk /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f > /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt # 4059 24354 765801 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt # add E/I , PCG anno awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/pa2annotate-ExonIntron2.awk sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt > sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed #IL4 check #get genomic coords, keep anno awk -f /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/tracks/paralyzer2tracks3.awk /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed > /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed for i in sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed do bedtools intersect -a $i -b /data/results/reference/mmu/3utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.3utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/5utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.5utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/cds.bed -wa -u -s |sort -k1,1 -k2,2n > $i.cds.bed wc -l !$ done 472 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.3utr.bed 110 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.5utr.bed 587 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.cds.bed #IL4 check for i in /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed do bedtools intersect -a $i -b /data/results/reference/mmu/3utr.bed -wa -wb -s |sort -k1,1 -k2,2n |bedtools merge -i - -s -c 4,5,6,10 -o distinct,distinct,distinct,collapse > $i.3utr.bed wc -l !$ awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-modscore-collapsed-cluster1.awk $i.3utr.bed > $i".3utr.best.bed" bedtools intersect -a $i -b /data/results/reference/mmu/5utr.bed -wa -wb -s |sort -k1,1 -k2,2n |bedtools merge -i - -s -c 4,5,6,10 -o distinct,distinct,distinct,collapse > $i.5utr.bed wc -l !$ awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-modscore-collapsed-cluster1.awk $i.5utr.bed > $i".5utr.best.bed" bedtools intersect -a $i -b /data/results/reference/mmu/cds.bed -wa -wb -s |sort -k1,1 -k2,2n |bedtools merge -i - -s -c 4,5,6,10 -o distinct,distinct,distinct,collapse > $i.cds.bed wc -l !$ awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-modscore-collapsed-cluster1.awk $i.cds.bed > $i".cds.best.bed" bedtools intersect -a $i -b /data/results/reference/mmu/intron.bed -wa -wb -s |sort -k1,1 -k2,2n |bedtools merge -i - -s -c 4,5,6,10 -o distinct,distinct,distinct,collapse > $i.intron.bed wc -l !$ awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-modscore-collapsed-cluster1.awk $i.intron.bed > $i".intron.best.bed" bedtools intersect -a $i -b /data/results/reference/mmu/ncRNA.bed -wa -wb -s |sort -k1,1 -k2,2n |bedtools merge -i - -s -c 4,5,6,10 -o distinct,distinct,distinct,collapse > $i.ncRNA.bed wc -l !$ awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-modscore-collapsed-cluster1.awk $i.ncRNA.bed > $i".ncRNA.best.bed" bedtools intersect -a $i -b /data/results/reference/mmu/exon.bed -wa -wb -s |sort -k1,1 -k2,2n |bedtools merge -i - -s -c 4,5,6,10 -o distinct,distinct,distinct,collapse > $i.exon.bed wc -l !$ awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-modscore-collapsed-cluster1.awk $i.exon.bed > $i".exon.best.bed" done #461 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.3utr.bed #103 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.5utr.bed #566 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.cds.bed #3302 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.intron.bed #147 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.ncRNA.bed #1349 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.exon.bed # (+ 461 103 566 147) #1277 awk -v fe="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.exon.best.bed" -v fn="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.ncRNA.best.bed" -v fi="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.intron.best.bed" -v f3="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.3utr.best.bed" -v f5="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.5utr.best.bed" -v fc="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.cds.best.bed" -f /data/images/proton/DKlab/mr/parclip/paralyzer/per-gene-stats3.awk > sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.per-gene.bed 3 intron sense_intronic 3 intron non_coding 147 ncRNA 1 intron IG_V_gene 22 intron transcribed_unprocessed_pseudogene 15 3utr nonsense_mediated_decay 192 3utr protein_coding 3 5utr nonsense_mediated_decay 5 intron unprocessed_pseudogene 29 5utr protein_coding 4 intron transcribed_processed_pseudogene 3 cds nonsense_mediated_decay 175 cds protein_coding 62 intron antisense 321 intron nonsense_mediated_decay 2773 intron protein_coding 519 intron retained_intron 512 intron processed_transcript 3 intron pseudogene 32 other nonsense_mediated_decay 1st max: 3utr 21 5utr 3 cds 8 2nd max: 3utr 0 5utr 0 cds 0 344 other protein_coding 1st max: 3utr 182 5utr 35 cds 127 2nd max: 3utr 0 5utr 0 cds 23 4 intron ambiguous_orf 74 intron lincRNA # exons without CDS of e.g. retained_introns were not included, add all exons as category for this case: awk -v fn="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.ncRNA.best.bed" -v fi="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.intron.best.bed" -v f3="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.3utr.best.bed" -v f5="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.5utr.best.bed" -v fc="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.cds.best.bed" -f /data/images/proton/DKlab/mr/parclip/paralyzer/per-gene-stats2.awk > sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.per-gene.bed chr4 44922000 44922104 ENSMUST00000067521_G6098.1_AAATAATGTCAGTAATTCAATATTGTCTTACTATTTAATACATTTTTGATGTATTAAATAGTACTGAGAGACCATTTCTTTTAATTATACAGTTATATAGCACAA_128_1231_5_9_1628_0.9819204955885362_1.000000_149_0.000000 1000 + ENSMUST00000142714|Zcchc7|nonsense_mediated_decay # cluster in 2 regions (all in ncRNA): awk '{i=++o[$4];t[$4,i]=$7"|"$8"|"$9;}END{for (i in o) {if (o[i]>1){print i;for (k=1;k<=o[i];k++){print t[i,k]}}}}' sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.per-gene.bed > foo #use genomic_chr genomic_start genomic_end genomic_strand transcript_id as unique key awk -v fe="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.exon.best.bed" -v fn="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.ncRNA.best.bed" -v fi="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.intron.best.bed" -v f3="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.3utr.best.bed" -v f5="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.5utr.best.bed" -v fc="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.cds.best.bed" -f /data/images/proton/DKlab/mr/parclip/paralyzer/per-gene-stats5.awk > sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.per-gene.bed2 2 intron sense_intronic 2 intron non_coding 2 ncRNA snoRNA 1 ncRNA rRNA 9 3utr nonsense_mediated_decay 355 3utr protein_coding 1 intron IG_V_gene 5 cds nonsense_mediated_decay 19 intron transcribed_unprocessed_pseudogene 277 cds protein_coding 1 5utr nonsense_mediated_decay 5 intron unprocessed_pseudogene 49 5utr protein_coding 2 intron transcribed_processed_pseudogene 47 intron antisense 38 ncRNA processed_transcript 125 intron nonsense_mediated_decay 2221 intron protein_coding 1 ncRNA snRNA 251 intron retained_intron 274 intron processed_transcript 33 ncRNA pseudogene 1 ncRNA misc_RNA 2 intron pseudogene 44 ncRNA lincRNA 15 ncRNA miRNA 54 intron lincRNA Dear George, at /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed you can find results for IL4. Some genes might need to be collapsed, as there are only 3819 clusters this might not be a problem. I'm finishing collapsing (also for 0h) until tomorrow. BW, Martin #get genomic coords, keep anno awk -f /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/tracks/paralyzer2tracks3.awk /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed > /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed # get best cluster in collapsed for i in sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed do awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-collapsed-cluster2.awk $i > $i".best.bed" done #assign exon vs intron based on majority, exon for ties for i in sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed do awk -f ../per-gene-exon-intron-stats1a.awk $i > $i.exon-intron done # intersect exon-intron-anno with transcript clusters for i in sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed do bedtools intersect -a $i.exon-intron -b $i.best.bed -wa -wb -s |awk 'BEGIN{OFS="\t"}{print $8,$9,$10,$11,$12,$13,$14,$7}' > $i.anno2 done for i in sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.anno2 do awk '{if ($NF=="E"){print $0}}' $i > $i.exon done for i in sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.anno2 do awk '{if ($NF=="I"){print $0}}' $i > $i.intron wc -l !$ done for i in sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.anno2.exon do bedtools intersect -a $i -b /data/results/reference/mmu/3utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.3utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/5utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.5utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/cds.bed -wa -u -s |sort -k1,1 -k2,2n > $i.cds.bed wc -l !$ done for i in sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.anno2.exon do bedtools intersect -v -a $i -b /data/results/reference/mmu/cds.bed /data/results/reference/mmu/5utr.bed /data/results/reference/mmu/3utr.bed -wa -s |sort -k1,1 -k2,2n > $i.ncRNA wc -l !$ done #here awk -v f3="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.anno2.exon.3utr.bed" -v f5="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.anno2.exon.5utr.bed" -v fc="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.anno2.exon.cds.bed" -f /data/images/proton/DKlab/mr/parclip/paralyzer/per-gene-exon-intron-stats1.awk > sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed (# filter protein_coding_genes and add annotation from /data/results/reference/mmu/mm9/1kIntrons2-stranded/Mus_musculus.NCBIM37.64-toMM9.headers for i in *txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt do awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/pa2annotate-ExonIntron1.awk $i |sort -k1,1 -k4,4 -k2,2n -k5,5n > $i.anno1.csv done ) for i in sh*rep*1kIntrons2*f.flt do cat $i | sort -k1,1 -k2,2n > $i.srt done bedtools multiinter -i sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt > sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2 bedtools multiinter -i sh-clusters-2hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt sh-clusters-2hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt > sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2 bedtools multiinter -i sh-clusters-6hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt sh-clusters-6hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt > sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2 ## 2of2 #add paralyzer details to intersection: awk -v f1="sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt" -v f2="sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt" -f ../get-multiintersect-details-2of2.awk sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2 > sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.bed awk -v f1="sh-clusters-2hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt" -v f2="sh-clusters-2hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt" -f ../get-multiintersect-details-2of2.awk sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2 > sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.bed awk -v f1="sh-clusters-6hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt" -v f2="sh-clusters-6hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt" -f ../get-multiintersect-details-2of2.awk sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2 > sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.bed #merge book-ended regions: awk -f ../get-merged-multiintersect-details1.awk sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.bed > sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.bed awk -f ../get-merged-multiintersect-details1.awk sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.bed > sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.bed awk -f ../get-merged-multiintersect-details1.awk sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.bed > sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.bed #get max scoring cluster, add max avgConvPct as score awk -f ../get-max-intersect-score2.awk sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.bed > sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.maxAvg.bed awk -f ../get-max-intersect-score2.awk sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.bed > sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.maxAvg.bed awk -f ../get-max-intersect-score2.awk sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.bed > sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.maxAvg.bed (for i in *1kI*flt do echo $i awk -f /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/tracks/paralyzer2tracks2.awk $i > $i".genomic.bed" done for i in *genomic.bed do echo $i sort -k1,1 -k2,2n $i | awk -f /data/images/proton/DKlab/mr/parclip/tracks/correct-bed.awk > $i.srt; done ) awk -v f1="sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt" -v f2="sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt" -f ../get-multiintersect-details-1of2.awk sh-clusters-0h.gt0.25TtoC2.plus.noIGG.flt.srt.intersection-all.bed3-1kIntrons2 > sh-clusters-0h.gt0.25TtoC2.plus.noIGG.flt.srt.union.bed3-1kIntrons2.bed awk -f ../get-max-intersect-score2.awk sh-clusters-0h.gt0.25TtoC2.plus.noIGG.flt.srt.union.bed3-1kIntrons2.merged.bed > sh-clusters-0h.gt0.25TtoC2.plus.noIGG.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed # add E/I , PCG anno for i in *.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed do echo $i awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/pa2annotate-ExonIntron2.awk $i > $i.anno1.bed done for i in sh-clusters-I??.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt do echo $i awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/pa2annotate-ExonIntron2.awk $i > $i.anno1.bed done #get genomic coords, keep anno for i in *anno1.bed do echo $i awk -f /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/tracks/paralyzer2tracks3.awk $i > $i".genomic2.bed" done #bookend merge for i in *.anno1.bed.genomic2.bed do cat $i |sort -k1,1 -k2,2n |bedtools merge -i - -s -c 4,5,6,7,8,9 -o collapse,max,distinct,collapse,collapse,collapse > foo > $i".collapsed" done # get best cluster in collapsed for i in *.anno1.bed.genomic2.bed.collapsed do awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-collapsed-cluster2.awk $i > $i".best.bed" done # intersect exon-intron-anno with transcript clusters for i in *.anno1.bed.genomic2.bed.collapsed do bedtools intersect -a $i.exon-intron -b $i.best.bed -wa -wb -s |awk 'BEGIN{OFS="\t"}{print $8,$9,$10,$11,$12,$13,$14,$7}' > $i.anno2 done for i in *.anno1.bed.genomic2.bed.collapsed.anno2 do awk '{if ($NF=="E"){print $0}}' $i > $i.exon done for i in *.anno1.bed.genomic2.bed.collapsed.anno2 do awk '{if ($NF=="I"){print $0}}' $i > $i.intron wc -l !$ done for i in *.anno1.bed.genomic2.bed.collapsed.anno2.exon do bedtools intersect -a $i -b /data/results/reference/mmu/3utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.3utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/5utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.5utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/cds.bed -wa -u -s |sort -k1,1 -k2,2n > $i.cds.bed wc -l !$ done for i in *.anno1.bed.genomic2.bed.collapsed.anno2.exon do bedtools intersect -v -a $i -b /data/results/reference/mmu/cds.bed /data/results/reference/mmu/5utr.bed /data/results/reference/mmu/3utr.bed -wa -s |sort -k1,1 -k2,2n > $i.ncRNA wc -l !$ done #2here #merge per-gene parts awk -v f3="sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.exon.3utr.bed" -v f5="sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.exon.5utr.bed" -v fc="sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.exon.cds.bed" -f /data/images/proton/DKlab/mr/parclip/paralyzer/per-gene-exon-intron-stats1.awk > sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed #recollect per-gene for i in *f.flt.*anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed do bedtools intersect -a $i -b /data/results/reference/mmu/3utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.3utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/5utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.5utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/cds.bed -wa -u -s |sort -k1,1 -k2,2n > $i.cds.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/intron.bed -wa -u -s |sort -k1,1 -k2,2n > $i.intron.bed wc -l !$ done for i in *f.flt*.anno1.bed.genomic2.bed.collapsed.anno2.exon.ncRNA do bedtools intersect -a $i -b /data/results/reference/mmu/3utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.3utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/5utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.5utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/cds.bed -wa -u -s |sort -k1,1 -k2,2n > $i.cds.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/intron.bed -wa -u -s |sort -k1,1 -k2,2n > $i.intron.bed wc -l !$ done #all zero, as expected for i in *f.flt*.anno1.bed.genomic2.bed.collapsed.anno2.intron do bedtools intersect -a $i -b /data/results/reference/mmu/3utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.3utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/5utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.5utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/cds.bed -wa -u -s |sort -k1,1 -k2,2n > $i.cds.bed wc -l !$ done sh-clusters-0h.gt0.25TtoC2.plus.noIGG.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno.bed.genomic2.bed.collapsed.best.per-gene-exons.bed sh-clusters-2h.gt0.25TtoC2.plus.noIGG.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno.bed.genomic2.bed.collapsed.best.per-gene-exons.bed sh-clusters-6h.gt0.25TtoC2.plus.noIGG.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno.bed.genomic2.bed.collapsed.best.per-gene-exons.bed sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.flt.anno.bed.genomic2.bed.collapsed.best.per-gene-exons.bed sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.flt.anno.bed.genomic2.bed.collapsed.best.per-gene-exons.bed 2nd annotation, clusters in PCGs: 7398 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.3utr.bed 1732 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.5utr.bed 8133 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.cds.bed 9720 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.intron.bed 5917 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.3utr.bed 1050 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.5utr.bed 6960 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.cds.bed 7776 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.intron.bed 8438 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.3utr.bed 2296 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.5utr.bed 10468 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.cds.bed 11794 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.intron.bed 2640 sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.3utr.bed 663 sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.5utr.bed 3175 sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.cds.bed 3612 sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.intron.bed 4213 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.3utr.bed 884 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.5utr.bed 5079 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.cds.bed 5674 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.intron.bed 2nd annotation, ncRNA clusters overlapping with introns: 2523 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.exon.ncRNA.intron.bed 2084 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.exon.ncRNA.intron.bed 2985 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.exon.ncRNA.intron.bed 940 sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.anno2.exon.ncRNA.intron.bed 1543 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.anno2.exon.ncRNA.intron.bed 2nd annotation, clusters in introns (both PGC/ncRNA): 292 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.3utr.bed 360 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.5utr.bed 2193 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.cds.bed 291 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.3utr.bed 286 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.5utr.bed 2034 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.cds.bed 329 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.3utr.bed 426 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.5utr.bed 2569 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.cds.bed 94 sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.anno2.intron.3utr.bed 149 sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.anno2.intron.5utr.bed 752 sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.anno2.intron.cds.bed 167 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.anno2.intron.3utr.bed 202 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.anno2.intron.5utr.bed 1137 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.anno2.intron.cds.bed