#@ add de-duplication
#@ filter clusters
#
# Working notes for post-processing PARalyzer clusters ("1kIntrons" runs).
# Commands appear in the order they were recorded. Pasted terminal output,
# example rows and free-text remarks are kept as "#" comments so that the
# file parses as shell. `wc !$` (interactive history expansion) has been
# replaced by the explicit file name, because `!$` is not expanded when the
# file is run as a script.
# NOTE(review): this was recorded as a notebook; directory changes between
# sections were not always written down — confirm the working directory
# before running a section on its own.
#
# Cluster field glossary:
#   EnsemblID = transcript ID
#   ClusterID = unique ID for the cluster
#   ReadCount = number of reads that overlap the cluster by at least 1 nucleotide
#   ModeLocation = coordinate of the location with the highest signal / (signal + background) value
#   ConversionLocationCount = number of unique location where at least 1 conversion occurred
#   ConversionEventCount = total number of conversions that occurred within the cluster
#   NonConversionEventCount = total number of possible conversion events that did not occur
#   ModeScore = score of the highest signal / (signal + background) value
#   AvgConversionPct = average conversion % of all conversions in the group containing the cluster
#   GroupConversionEventCount = number of all conversions in the group containing the cluster
#   SdevConversionPct = sdev of conversion % of all conversions in the group containing the cluster
#   MaxConversionPct = max. conversion % of all conversions in the group containing the cluster
#   conversionFreq = ConversionEventCount / (ConversionEventCount + NonConversionEventCount)
# filter: conversionFreq >0.05 && (ReadCount>=5)

# ---------------------------------------------------------------------------
# Shared locations (spelled out in full in the original notes)
# ---------------------------------------------------------------------------
PARCLIP=/data/images/proton/DKlab/mr/parclip
PA_DIR=$PARCLIP/paralyzer
RUN_DIR=$PA_DIR/PARalyzer_v1_1d
INTRON_DIR=$PARCLIP/shrimp-mRNA-introns1k
REF_DIR=/data/results/reference/mmu/mm9/mRNA-genomic-max-intron-15k
SAMTOOLS=/data/results/tools/samtools/samtools-1.3/samtools
BED_TO_BIGBED=/data/results/tools/gbrowser/bedToBigBed
MM9_CHROM_SIZES=/data/results/reference/mmu/Mus_musculus/UCSC/mm9/Sequence/WholeGenomeFasta/mm9.chrom.sizes

# Suffix shared by the sorted, filtered per-replicate cluster files.
FLT_SRT=gt0.25TtoC2.plus.noIGG.f.flt.srt
# Plus-strand IGG (control) cluster file used for background removal.
IGG_PLUS=sh-clusters-IGG.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Run pa2annotate-ExonIntron1.awk on each file given as argument and write
# the sorted result to <file>.anno1.csv.
annotate_exon_intron() {
  local f
  for f in "$@"; do
    awk -f "$PA_DIR/pa2annotate-ExonIntron1.awk" "$f" \
      | sort -k1,1 -k4,4 -k2,2n -k5,5n > "$f.anno1.csv"
  done
}

# Run paralyzer2tracks2.awk on each file given as argument and write the
# result to <file>.genomic.bed (the notes say: "use genomic coords in $1 for
# mapping").
to_genomic_bed() {
  local f
  for f in "$@"; do
    echo "$f"
    awk -f "$INTRON_DIR/tracks/paralyzer2tracks2.awk" "$f" > "$f.genomic.bed"
  done
}

# For every *genomic.bed in the current directory: sort it, pipe it through
# correct-bed.awk into <bed>.srt, convert that to <bed>.bb with bedToBigBed
# and print the matching "bigDataUrl=" line.
# NOTE(review): the printed URL always points at .../parclip/tracks/, also
# when called from the test-tracks directory — kept exactly as recorded.
make_bigbed_tracks() {
  local bed
  for bed in *genomic.bed; do
    echo "$bed"
    sort -k1,1 -k2,2n "$bed" \
      | awk -f "$PARCLIP/tracks/correct-bed.awk" > "$bed.srt"
    "$BED_TO_BIGBED" "$bed.srt" "$MM9_CHROM_SIZES" "$bed.bb"
    echo "bigDataUrl=http://genomics-lab.fleming.gr/fleming/DKlab/mr/parclip/tracks/$bed.bb"
  done
}

# Print the name of the sorted, filtered cluster file of replicate $1.
rep_srt() {
  printf 'sh-clusters-%s.txt2-1kIntrons.csv3.bed.gt5tc.%s\n' "$1" "$FLT_SRT"
}

# Call "$@" once per time point, appending "<timepoint> <repA> <repB>".
# The replicate pairs are the ones left after outlier removal
# (0hrep1, 2hrep3 and 6hrep3 were dropped).
for_each_timepoint() {
  "$@" 0h 0hrep2 0hrep3
  "$@" 2h 2hrep1 2hrep2
  "$@" 6h 6hrep1 6hrep2
}

# bedtools multiinter of the two replicates of one time point.
#   $1 time point, $2 / $3 replicate labels
multiinter_timepoint() {
  bedtools multiinter -i "$(rep_srt "$2")" "$(rep_srt "$3")" \
    > "sh-clusters-$1.$FLT_SRT.intersection-all.bed3-1kIntrons"
}

# Add PARalyzer details to a multiinter result, merge book-ended regions and
# pick the max scoring cluster (max avgConvPct as score).
#   $1 details awk script (2of2 or 1of2 variant)
#   $2 output kind: "intersection-all" (2of2) or "union" (1of2)
#   $3 time point, $4 / $5 replicate labels
# Writes <stem>.bed, <stem>.merged.bed and <stem>.merged.maxAvg.bed with
# <stem> = sh-clusters-<timepoint>.$FLT_SRT.<kind>.bed3-1kIntrons
add_details_merge_score() {
  local details_awk=$1 kind=$2 tp=$3 f1 f2 stem
  f1=$(rep_srt "$4")
  f2=$(rep_srt "$5")
  stem="sh-clusters-$tp.$FLT_SRT.$kind.bed3-1kIntrons"
  awk -v f1="$f1" -v f2="$f2" -f "$details_awk" \
    "sh-clusters-$tp.$FLT_SRT.intersection-all.bed3-1kIntrons" > "$stem.bed"
  awk -f ../get-merged-multiintersect-details1.awk "$stem.bed" > "$stem.merged.bed"
  awk -f ../get-max-intersect-score2.awk "$stem.merged.bed" > "$stem.merged.maxAvg.bed"
}

# ---------------------------------------------------------------------------
# Reference 2bit + MD tags / bowtie-format conversion
# ---------------------------------------------------------------------------
cd "$REF_DIR/" || exit 1
/home/reczko/bin/faToTwoBit mm9-mRNA-introns1k.fa mm9-mRNA-introns1k.2bit

# The notes list several loop headers in front of a single "do"; only one can
# be active, so the last one is kept live (presumably the others are earlier
# runs of the same loop — TODO confirm).
#for i in IFN-15mMm.bam
#for i in 0hrep3-15mMm1kIntron.bam
#for i in 6hrep3-15mMm1kIntron.bam
#for i in IGG-15mMm.bam
for i in 2hrep?-15mMm1kIntron.bam; do
  echo "$i"
  # add MD flags for bowtie format
  # /data/results/tools/samtools/samtools-1.3/samtools calmd -b $i /data/results/reference/mmu/mm9/mRNA-genomic-max-intron-15k/Mus_musculus.NCBIM37.64-toMM9.max-intron-1k >& /dev/null > foo.bam2
  # stdout (the BAM) goes to foo.bam2 and stderr is discarded; this is the
  # same effect as the recorded ">& /dev/null > foo.bam2", written plainly.
  "$SAMTOOLS" calmd -b "$i" \
    "$REF_DIR/Mus_musculus.NCBIM37.64-toMM9.tr-with-1k-introns.fa" \
    > foo.bam2 2> /dev/null
  samtools view -h foo.bam2 | awk -f "$PA_DIR/sam2bowtie.awk" > "$i.md.bt"
  mv foo.bam2 "$i.md.bam"
done

#uor
# NOTE(review): "#uor" is kept verbatim (meaning unknown). The cd that follows
# it in the notes is treated as a live command because the next command uses
# file names relative to this directory — confirm.
cd "$INTRON_DIR" || exit 1
cat 0hrep1-15mMm.bam.md.bt 0hrep2-15mMm.bam.md.bt 0hrep3-15mMm1kIntron.bam.md.bt \
  > 0h-15mMm1kIntron.bam.md.bt
wc 0h-15mMm1kIntron.bam.md.bt
# recorded output:
#   222931180 1783449423 29141795018 0h-15mMm1kIntron.bam.md.bt

gcc omit-reads.c -lJudy -o omit-reads
./omit-reads "$INTRON_DIR/0h-15mMm1kIntron.bam.md.bt"

# sanity check
head -99999 "$PARCLIP/shrimp-genomic-mRNA/IFN-15mMm1kIntron2.bam.md.bt" | grep "T>C" | wc
# recorded output:
#   13530 108240 1690155
# The next command was pasted together with a terminal title/prompt escape
# sequence; the prompt was
#   reczko@max:/data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d$
head -99999 "$PARCLIP/shrimp/0hrep1-15mMm.bam.md.bam.bt" | grep "T>C" | wc
# recorded output:
#   14657 117256 1463303

# At /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1b/res/recommended_settings_with_sdev
# are the clusters with the sdev info added, format:
#   EnsemblID = transcript ID
#   ClusterID = unique ID for the cluster
#   ReadCount = number of reads that overlap the cluster by at least 1 nucleotide
#   ModeLocation = coordinate of the location with the highest signal / (signal + background) value
#   ConversionLocationCount = number of unique location where at least 1 conversion occurred
#   ConversionEventCount = total number of conversions that occurred within the cluster
#   NonConversionEventCount = total number of possible conversion events that did not occur
#   ModeScore = score of the highest signal / (signal + background) value
#   AvgConversionPct = average conversion % of all conversions in the group containing the cluster
#   GroupConversionEventCount = number of all conversions in the group containing the cluster
#   SdevConversionPct = sdev of conversion % of all conversions in the group containing the cluster
#   MaxConversionPct = max. conversion % of all conversions in the group containing the cluster

# ---------------------------------------------------------------------------
# PARalyzer run
# ---------------------------------------------------------------------------
#needed for PARalyzer
# (the notes read "cs <dir>"; taken to be a typo for "cd")
cd "$REF_DIR" || exit 1
/home/reczko/bin/faToTwoBit mm9-genomic-mRNA.fa mm9-genomic-mRNA.2bit
"$RUN_DIR/myp-all-1kIntrons2.sh"

# ---------------------------------------------------------------------------
# paralyzer to bed, more than 5 TtoC
# ---------------------------------------------------------------------------
#/data/images/proton/DKlab/mr/parclip/paralyzer/pa2bed-gt5tc.awk
#in: paralyzer format
# 1 2 3 4 5 6 7 8 9 10 11 12
#Chromosome,Strand,ClusterStart,ClusterEnd,ClusterID,ClusterSequence,ReadCount,ModeLocation,ModeScore,ConversionLocationCount,ConversionEventCount,NonConversionEventCount
#out: bed format
# 1 2 3 4 _ 5 _ 6 _ 7 _ 8 _ 9 _ 10 11 12
# 1 2 3 4 5 6
#Chromosome,ClusterStart,ClusterEnd,ClusterID,ClusterSequence,ReadCount,ModeLocation,ConversionLocationCount,ConversionEventCount,NonConversionEventCount,ModeScore,Strand
for i in sh-clusters-*.txt2-1kIntrons.csv; do
  echo "$i"
  awk -f "$PA_DIR/pa2bed-gt5tc.awk" "$i" > "$i.bed.gt5tc"
done

# /data/images/proton/DKlab/mr/parclip/paralyzer/get-TtoC-conversionPct-noCutoff2.awk adds:
## 1..6(same) AvgConversionPct MaxConversionPct ConversionEventCount SdevConversionPct
#to dist
# /data/images/proton/DKlab/mr/parclip/paralyzer/pa-dist2bed2.awk
#paralyzer+ format
#Chromosome,Strand,ClusterStart,ClusterEnd,ClusterID,InfoType,AvgConversionPct,MaxConversionPct,ConversionEventCount,SdevConversionPct
#1 ,2 ,3 ,4 ,5 ,6 ,7 ,8 ,9 ,10
#out: bed format
# 1 2 3 4 _ (7) _ (9) _ (10) 5 _ 6
#Chromosome,ClusterStart,ClusterEnd,ClusterID,AvgConversionPct,ConversionEventCount,SdevConversionPct, MaxConversionPct ,Strand
for i in sh-distributions-???*.txt2-1kIntrons.csv; do
  echo "$i"
  # The file is fed on stdin (as the recorded "cat $i |" did).
  awk -f "$PA_DIR/get-TtoC-conversionPct-noCutoff2.awk" < "$i" \
    | awk -f "$PA_DIR/pa-dist2bed2.awk" > "$i.avg.csv3.bed"
done

# ---------------------------------------------------------------------------
# intersect maxTtoC_gt_0.25 with clusters
# ---------------------------------------------------------------------------
# (note: /data/images/proton/DKlab/mr/parclip/paralyzer/add_conversion_pct2.awk adds
# AvgConversionPct_ConversionEventCount_SdevConversionPct MaxConversionPct
# )
#in:
# 1 2 3 4 5 6 7 8 9 10 11 12
#in: ENSMUST00000000254-chr6-+-128884399-128885000 278 324 G5.2_ATTAGTTATTCTATTGGAGTATACAATACTCGAATAGTTCTCAGGCA_275_317_5_96_1643 0.7790820460717197 + ENSMUST00000000254-chr6-+-128884399-128885000 163 374 G5_0.004717_212_0.019691 0.126365 +
#input 2 parts
#cluster
# 1 2 3 4 _ 5 _ 6 _ 7 _ 8 _ 9 _ 10 11 12
# 1 2 3 4 5 6
#Chromosome,ClusterStart,ClusterEnd,ClusterID,ClusterSequence,ReadCount,ModeLocation,ConversionLocationCount,ConversionEventCount,NonConversionEventCount,ModeScore,Strand
#group/dist
# 1 2 3 4 _ (7) _ (9) _ (10) 5 6
#Chromosome,ClusterStart,ClusterEnd,ClusterID,AvgConversionPct,ConversionEventCount,SdevConversionPct, MaxConversionPct ,Strand
#input all:
#cluster
# 1 2 3 4 _ 5 _ 6 _ 7 _ 8 _ 9 _ 10 11 12
# 1 2 3 4 5 6
#Chromosome,ClusterStart,ClusterEnd,ClusterID,ClusterSequence,ReadCount,ModeLocation,ConversionLocationCount,ConversionEventCountC,NonConversionEventCount,ModeScore,Strand
#group/dist
# 7 8 9 10 11 12
#Chromosome,ClusterStart,ClusterEnd,ClusterID,AvgConversionPct,ConversionEventCountG,SdevConversionPct, MaxConversionPct ,Strand
#out
#Chromosome,ClusterStart,ClusterEnd,ClusterID_ClusterSequence_ReadCount_ModeLocation_ConversionLocationCount_ConversionEventCountC_NonConversionEventCount_ModeScore_AvgConversionPct_ConversionEventCountG_SdevConversionPct, MaxConversionPct ,Strand
#
# Same command for every sample, in the order recorded in the notes.
for sample in 0hrep1 2hrep1 6hrep1 0hrep2 2hrep2 6hrep2 0hrep3 2hrep3 6hrep3 IFN IGG IL4; do
  bedtools intersect \
    -a "sh-clusters-$sample.txt2-1kIntrons.csv.bed.gt5tc" \
    -b "sh-distributions-$sample.txt2-1kIntrons.csv.avg.csv3.bed" \
    -wa -wb -s \
    | awk -f "$PA_DIR/add_conversion_pct2.awk" \
    > "sh-clusters-$sample.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2"
done

# ---------------------------------------------------------------------------
# separate plus strand results
# ---------------------------------------------------------------------------
for i in sh-clusters-*1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2; do
  wc "$i"
  awk -f ../filterPlusStrand.awk "$i" > "$i.plus"
  wc "$i.plus"
done
# recorded output:
#   754552 4527312 131501428 sh-clusters-0hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2
#   406385 2438310 71003130 sh-clusters-0hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus
#   508797 3052782 88325829 sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2
#   275171 1651026 47880004 sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus
#   372362 2234172 63783284 sh-clusters-IGG.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2
#   200678 1204068 34448891 sh-clusters-IGG.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus
#   1118143 6708858 197457040 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2
#   607183 3643098 107466384 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus

#here
awk -f "$PA_DIR/filter_GE5reads_convEvPctGT5.awk" "$IGG_PLUS" > "$IGG_PLUS.f"

# ---------------------------------------------------------------------------
#remove IGG regions
# ---------------------------------------------------------------------------
for i in sh-clusters-*1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus; do
  wc "$i"
  bedtools intersect -a "$i" -b "$IGG_PLUS.f" -v -s \
    | awk -f "$PA_DIR/filter_GE5reads_convEvPctGT5.awk" \
    | sort -k1,1 -k2,2n > "$i.noIGG.f"
  wc "$i.noIGG.f"
done
# recorded output:
#   406385 2438310 71003130 sh-clusters-0hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus
#   201014 1206084 34939979 sh-clusters-0hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f
#   230535 1383210 39925141 sh-clusters-0hrep2.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus
#   124325 745950 21474386 sh-clusters-0hrep2.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f
#   630172 3781032 111461295 sh-clusters-0hrep3.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus
#   319033 1914198 56190410 sh-clusters-0hrep3.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f
#   248467 1490802 43285179 sh-clusters-2hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus
#   123739 742434 21475080 sh-clusters-2hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f
#   441243 2647458 78017048 sh-clusters-2hrep2.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus
#   229227 1375362 40446701 sh-clusters-2hrep2.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f
#   381937 2291622 66503792 sh-clusters-2hrep3.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus
#   200458 1202748 34728724 sh-clusters-2hrep3.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f
#   592162 3552972 103271917 sh-clusters-6hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus
#   285681 1714086 49226781 sh-clusters-6hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f
#   506498 3038988 89275744 sh-clusters-6hrep2.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus
#   245419 1472514 43066514 sh-clusters-6hrep2.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f
#   240866 1445196 41752395 sh-clusters-6hrep3.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus
#   133504 801024 23118415 sh-clusters-6hrep3.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f
#   275171 1651026 47880004 sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus
#   155189 931134 26941627 sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f
#   200678 1204068 34448891 sh-clusters-IGG.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus
#   0 0 0 sh-clusters-IGG.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f
#   607183 3643098 107466384 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus
#   279278 1675668 48690486 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f

# ---------------------------------------------------------------------------
# ModeScore cutoff per sample
# ---------------------------------------------------------------------------
#topuniform=max(which(lp8>0.05)); #highest modscore that is not uniformly distributed
Rscript "$RUN_DIR/get_modescore_hist_1kIntrons_f.r"
# recorded output:
#   [1] "modscore_cutoff for IFN 0.946 0.0591007877911048"
#   [1] "modscore_cutoff for IL4 0.96 0.0629382265671651"
#   [1] "modscore_cutoff for 0h_rep1 0.95 0.225209996303595"
#   [1] "modscore_cutoff for 0h_rep2 0.946 0.099803290998472"
#   [1] "modscore_cutoff for 0h_rep3 0.946 0.0616650068898846"
#   [1] "modscore_cutoff for 2h_rep1 0.961 0.0985282900778099"
#   [1] "modscore_cutoff for 2h_rep2 0.957 0.113417438308439"
#   [1] "modscore_cutoff for 2h_rep3 0.941 0.061839882253766"
#   [1] "modscore_cutoff for 6h_rep1 0.951 0.0611335641354568"
#   [1] "modscore_cutoff for 6h_rep2 0.952 0.340071518567869"
#   [1] "modscore_cutoff for 6h_rep3 0.962 0.0690384376905233"

# One "sample threshold" pair per line; the th values are exactly the ones
# recorded in the notes.
# NOTE(review): most of them differ from the cutoffs printed above (e.g. IFN
# 0.946 printed vs. th=0.945 used) — confirm which run they were taken from.
while read -r sample th; do
  flt_in="$RUN_DIR/sh-clusters-$sample.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f"
  awk -v th="$th" -f "$PA_DIR/filter_modescore1_keep_best.awk" "$flt_in" > "$flt_in.flt"
  wc "$flt_in.flt"
done <<'EOF'
IFN 0.945
IL4 0.954
0hrep1 0.948
0hrep2 0.946
0hrep3 0.956
2hrep1 0.956
2hrep2 0.936
2hrep3 0.932
6hrep1 0.941
6hrep2 0.937
6hrep3 0.934
EOF
# recorded output (only these three were noted):
# 21295 127770 3826706 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt
# 33981 203886 6151118 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt
# 24170 145020 4364548 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-0hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt

#if 0
# NOTE(review): "#if 0" / "#endif" are plain comments in shell; the commands
# between them are NOT disabled. Markers kept verbatim from the notes.
#2do include introns+cds
# filter protein_coding_genes and add annotation from /data/results/reference/mmu/mm9/1kIntrons-stranded/Mus_musculus.NCBIM37.64-toMM9.headers
annotate_exon_intron *txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt

#make genomicbed+tracks:
#ENSMUST00000164402-chr2-+-25430951-25435619 3303 3331 G14114.1_CAGTACATCGCGTTCTCGGTACCTCCCAA_23_3332_4_13_118_0.9630921701788632_1.000000_65_0.000000 1.000000 +
# use genomic coords in $1 for mapping
cd "$RUN_DIR" || exit 1
to_genomic_bed *1kI*f.flt

cd "$INTRON_DIR/tracks" || exit 1
for i in "$RUN_DIR"/*genomic.bed; do
  ln -s "$i"
done

#if tracks needed
make_bigbed_tracks
# browser link:
# http://genomics-lab.fleming.gr/cgi-bin/hgTracks?db=mm9&hubUrl=http://genomics-lab.fleming.gr/fleming/DKlab/mr/parclip/shrimp-mRNA-introns1k/tracks/hub.txt
# example rows noted here:
# ENSMUST00000100497-chr5---143664794-143668433 1056 1126 G4560.1_TGATAGTTCGCCATGGATGACGATATCGCTGCGCTGGTCGTCGACAACGGCTCCGGCATGTGCAAAGCCGG_20_1094_8_8_68_0.999106697968966_1.000000_113_0.000000 1.000000 +
# 0hrep1
# ENSMUST00000113676-chr5-+-67698194-67698936 425 459 G6588.1_AGGTCTTAGGGCACATCGCGGTGACCCAGGGTGAC_25_460_4_22_109_0.9998667980625998_0.833333_4_0.333333 1.000000 +
# ENSMUST00000022142-chr13---101183104-101184631 814 873 G981.1_GCAGAGTTGTGTCCTCTAGTTGTGTCATAGTGATTCAGTCTGAATGTTATATATTGTTTC_14_874_9_14_137_0.9979809824870751_1.000000_60_0.000000 1.000000 +
#diff correct-bed2.awk to correct-bed.awk +: start-1 end-1 -: start-1 end-1

# test tracks version for G.Giagkas
#source:
cd "$RUN_DIR/test-tracks" || exit 1
to_genomic_bed *1kI*flt
cd "$INTRON_DIR/test-tracks" || exit 1
make_bigbed_tracks
# (this cd was recorded twice in a row in the notes)
cd "$RUN_DIR/test-tracks" || exit 1
./update-hub.sh
# browser links:
# http://genomics-lab.fleming.gr/cgi-bin/hgTracks?db=mm9&hubUrl=http://genomics-lab.fleming.gr/fleming/DKlab/mr/parclip/shrimp-mRNA-introns1k/test-tracks/hub.txt
# http://genomics-lab.fleming.gr/cgi-bin/hgTracks?udcTimeout=0&db=mm9&hubUrl=http://genomics-lab.fleming.gr/fleming/DKlab/mr/parclip/shrimp-mRNA-introns1k/test-tracks/hub.txt
#endif tracks needed

# ---------------------------------------------------------------------------
# outlier removal using MDS+trees: 0hrep1 2hrep3 6hrep3
# intersect clusters after outlier removal
# ---------------------------------------------------------------------------
#prepare 2 of 3 intersection
for i in sh*rep*1kIn*f.flt; do
  sort -k1,1 -k2,2n "$i" > "$i.srt"
done
# sh-clusters-0hrep2.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt
#   ENSMUST00000170376-chr10---13243648-13244249 268 337 G6665.1_TTGAGTATGATAACTTGTTGCAATAAACTATTTTAATAAAATATAGCTTTGTTTAGTTAATGCTTTTTAG_10_338_5_6_77_0.999817838122349_1.000000_111_0.000000 1.000000 +
# sh-clusters-0hrep3.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt
#   ENSMUST00000170376-chr10---13243648-13244249 240 273 G16315.1_GTATAAATTTAATAACTTTGTATTGATTTTGAGT_21_274_4_8_217_0.9780242797679841_0.708333_4_0.343592 1.000000 +
#   ENSMUST00000170376-chr10---13243648-13244249 321 351 G16316.1_TAGTTAATGCTTTTTAGCCATTAATTTATTT_28_352_4_10_258_0.9461657648651448_1.000000_120_0.000000 1.000000 +
# => 2 clusters overlapping with 1 larger
for_each_timepoint multiinter_timepoint

## 2of2
#add paralyzer details to intersection:
#merge book-ended regions:
#get max scoring cluster, add max avgConvPct as score
for_each_timepoint add_details_merge_score ../get-multiintersect-details-2of2.awk intersection-all
wc sh-clusters-?h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons.merged.maxAvg.bed
# recorded output:
#   1641 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons.merged.maxAvg.bed
#   909 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons.merged.maxAvg.bed
#   2569 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons.merged.maxAvg.bed

## 1of2
#add paralyzer details to intersection:
#merge book-ended regions:
#get max scoring cluster, add max avgConvPct as score
for_each_timepoint add_details_merge_score ../get-multiintersect-details-1of2.awk union
wc -l sh-clusters-?h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed
# recorded output:
#   57403 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed
#   47320 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed
#   68119 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed
#endif

# add E/I , PCG anno
annotate_exon_intron *.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed

#add 5/3utr/cds info:
to_genomic_bed *.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed

grep ENSMUST00000170376 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons.merged.maxAvg.bed
# recorded output:
#   ENSMUST00000170376-chr10---13243648-13244249 268 273 G6665.1_TTGAGTATGATAACTTGTTGCAATAAACTATTTTAATAAAATATAGCTTTGTTTAGTTAATGCTTTTTAG_10_338_5_6_77_0.999817838122349_1.000000_111_0.000000_1.000000 1.000000 +
#   ENSMUST00000170376-chr10---13243648-13244249 321 337 G6665.1_TTGAGTATGATAACTTGTTGCAATAAACTATTTTAATAAAATATAGCTTTGTTTAGTTAATGCTTTTTAG_10_338_5_6_77_0.999817838122349_1.000000_111_0.000000_1.000000 1.000000 +
# cluster split due to:
#   sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons:ENSMUST00000170376-chr10---13243648-13244249 268 273 2 1,2 1 1
#   sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons:ENSMUST00000170376-chr10---13243648-13244249 273 321 1 1 1 0 #gap
#   sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons:ENSMUST00000170376-chr10---13243648-13244249 321 337 2 1,2 1 1

# Strand-aware overlap of every genomic bed with the 3'UTR, 5'UTR and CDS
# annotation beds; the line count of each result is printed.
for i in *f.flt*.genomic.bed; do
  for region in 3utr 5utr cds; do
    bedtools intersect -a "$i" -b "/data/results/reference/mmu/$region.bed" -wa -wb -s \
      > "$i.$region.bed"
    wc -l "$i.$region.bed"
  done
done
# recorded output (continues below):
# 12785 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed.3utr.bed
# 3839 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed.5utr.bed
# 23560 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed.cds.bed
# 5316 sh-clusters-0hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.3utr.bed
# 1284 sh-clusters-0hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.5utr.bed
# 10297 sh-clusters-0hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.cds.bed
# 3315 sh-clusters-0hrep2.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.3utr.bed
# 1039 sh-clusters-0hrep2.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.5utr.bed
# 6770 sh-clusters-0hrep2.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.cds.bed
# 9594 sh-clusters-0hrep3.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.3utr.bed
# 2838 sh-clusters-0hrep3.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.5utr.bed
# 16959 sh-clusters-0hrep3.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.cds.bed
# 10257 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed.3utr.bed
# 2243 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed.5utr.bed
# 20418 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed.cds.bed
# 3080 sh-clusters-2hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.3utr.bed
# 764 sh-clusters-2hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.5utr.bed
# 6840 sh-clusters-2hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.cds.bed
# 7271 sh-clusters-2hrep2.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.3utr.bed
# 1498 sh-clusters-2hrep2.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.5utr.bed
# 13779 sh-clusters-2hrep2.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.cds.bed
# 5473 sh-clusters-2hrep3.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.3utr.bed
# 1541 sh-clusters-2hrep3.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.5utr.bed
# 11598 sh-clusters-2hrep3.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.cds.bed
14533 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed.3utr.bed
4941 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed.5utr.bed
29803 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed.cds.bed
7450 sh-clusters-6hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.3utr.bed
3270 sh-clusters-6hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.5utr.bed
15918 sh-clusters-6hrep1.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.cds.bed
7340 sh-clusters-6hrep2.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.3utr.bed
1739 sh-clusters-6hrep2.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.5utr.bed
14471 sh-clusters-6hrep2.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.cds.bed
3908 sh-clusters-6hrep3.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.3utr.bed
1130 sh-clusters-6hrep3.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.5utr.bed
7944 sh-clusters-6hrep3.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.cds.bed
4502 sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.3utr.bed
1406 sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.5utr.bed
8942 sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.cds.bed
7059 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.3utr.bed
1917 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.5utr.bed
14120 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.cds.bed
(
  # keep matching clusters
  # For every genomic cluster file (merged time points first, then the IFN/IL4
  # samples) intersect it strand-aware (-s) with each reference region file,
  # keeping both the cluster (-wa) and the region record (-wb), filter the
  # pairs through keep_matching_clusters.awk and report the line count.
  for i in *.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed \
           sh-clusters-I??.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed; do
    for region in 3utr 5utr cds; do
      bedtools intersect -a "$i" -b "/data/results/reference/mmu/$region.bed" -wa -wb -s \
        | awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/keep_matching_clusters.awk \
        > "$i.$region.bed"
      # The file name is spelled out instead of `!$`: history expansion is
      # only active in interactive shells, so `!$` breaks when this is run
      # as a script.
      wc -l "$i.$region.bed"
    done
  done
)
#@ merge bookended clusters
# NOTE(review): A.bed looks like a placeholder file name -- confirm before running.
bedtools merge -i A.bed -c 4 -o collapse
ls -lt *srt*.genomic.bed
-rw-r--r-- 1 reczko users 10796967 Jun 1 14:11 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed
-rw-r--r-- 1 reczko users 294201 Jun 1 14:11 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed
-rw-r--r-- 1 reczko users 7005100 Jun 1 14:11 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed
-rw-r--r-- 1 reczko users 103642 Jun 1 14:11 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed
-rw-r--r-- 1 reczko users 9545239 Jun 1 14:11
sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed
-rw-r--r-- 1 reczko users 187153 Jun 1 14:11 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed
# (terminal title/prompt residue removed; the session ran as reczko@max in
#  /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d)
ls -lt *-I??.*.genomic.bed
-rw-r--r-- 1 reczko users 5569621 May 23 12:53 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed
-rw-r--r-- 1 reczko users 3593977 May 23 12:53 sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed

# Strand-aware merge of the coordinate-sorted clusters: column 4 is collapsed,
# column 5 reduced to its maximum, column 6 to its distinct values.
# (The original also redirected to a scratch file `foo` first; only the last
# stdout redirection receives data, so that stray empty file is dropped.)
for i in *f.flt*.genomic.bed; do
  sort -k1,1 -k2,2n "$i" \
    | bedtools merge -i - -s -c 4,5,6 -o collapse,max,distinct > "$i.collapsed"
done

# Reduce every collapsed record with get-best-collapsed-cluster1.awk.
for i in *f.flt*.genomic.bed.collapsed; do
  awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-collapsed-cluster1.awk "$i" > "$i.best.bed"
done

# Strand-aware overlap of the best clusters with each reference region file,
# merged again so every cluster appears once; column 10 (taken from the region
# record appended by -wb) is collapsed into a list.
for i in *f.flt*.genomic.bed.collapsed.best.bed; do
  for region in 3utr 5utr cds; do
    bedtools intersect -a "$i" -b "/data/results/reference/mmu/$region.bed" -wa -wb -s \
      | sort -k1,1 -k2,2n \
      | bedtools merge -i - -s -c 4,5,6,10 -o distinct,distinct,distinct,collapse \
      > "$i.$region.bed"
    # Explicit file name instead of `!$` (history expansion is interactive-only).
    wc -l "$i.$region.bed"
  done
done

(
  # The five samples that get a per-gene summary.
  genomic_beds=(
    "sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed"
    "sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed"
    "sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed"
    "sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed"
    "sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed"
  )

  # Combine the 3'UTR / 5'UTR / CDS interval sets of each sample.
  for g in "${genomic_beds[@]}"; do
    best="$g.collapsed.best"
    bedtools multiinter -i "$best.bed.3utr.bed" "$best.bed.5utr.bed" "$best.bed.cds.bed" \
      > "$best.3utr.5utr.cds.bed"
  done

  # Per-gene statistics. The three region files are handed over as awk
  # variables and no input file is named on the command line.
  # NOTE(review): presumably per-gene-stats1.awk reads f3/f5/fc itself -- confirm.
  # The samples are independent, so the processing order does not matter.
  for g in "${genomic_beds[@]}"; do
    best="$g.collapsed.best"
    awk -v f3="$best.bed.3utr.bed" \
        -v f5="$best.bed.5utr.bed" \
        -v fc="$best.bed.cds.bed" \
        -f /data/images/proton/DKlab/mr/parclip/paralyzer/per-gene-stats1.awk \
      > "$best.per-gene.bed"
  done
)
Dear Margarita and George, the per-gene processing results as discussed are at /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d:
sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed.collapsed.best.per-gene.bed
sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed.collapsed.best.per-gene.bed
sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.genomic.bed.collapsed.best.per-gene.bed
sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.collapsed.best.per-gene.bed sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.genomic.bed.collapsed.best.per-gene.bed 0h 6 cds polymorphic_pseudogene 2 5utr IG_V_gene 47 cds IG_V_gene 2 other IG_V_gene 3utr 5utr 2 cds 0 1251 3utr nonsense_mediated_decay 10584 3utr protein_coding 1 cds IG_C_gene 159 5utr nonsense_mediated_decay 2567 5utr protein_coding 648 cds nonsense_mediated_decay 15802 cds protein_coding 156 other nonsense_mediated_decay 1st max: 3utr 114 5utr 42 cds 0 2nd max: 3utr 0 5utr 0 cds 156 1085 other protein_coding 1st max: 3utr 525 5utr 560 cds 0 2nd max: 3utr 0 5utr 2 cds 1083 2h 4 cds polymorphic_pseudogene 23 cds IG_V_gene 2 other IG_V_gene 3utr 5utr 2 cds 0 832 3utr nonsense_mediated_decay 7072 3utr protein_coding 4 cds IG_C_gene 45 5utr nonsense_mediated_decay 1149 5utr protein_coding 390 cds nonsense_mediated_decay 10611 cds protein_coding 86 other nonsense_mediated_decay 1st max: 3utr 74 5utr 12 cds 0 2nd max: 3utr 0 5utr 0 cds 86 576 other protein_coding 1st max: 3utr 310 5utr 266 cds 0 2nd max: 3utr 0 5utr 3 cds 573 6h 2 cds polymorphic_pseudogene 24 cds IG_V_gene 1098 3utr nonsense_mediated_decay 10236 3utr protein_coding 2 cds IG_C_gene 113 5utr nonsense_mediated_decay 2060 5utr protein_coding 542 cds nonsense_mediated_decay 13091 cds protein_coding 138 other nonsense_mediated_decay 1st max: 3utr 105 5utr 33 cds 0 2nd max: 3utr 0 5utr 0 cds 138 849 other protein_coding 1st max: 3utr 437 5utr 412 cds 0 2nd max: 3utr 0 5utr 6 cds 843 IFN 1 3utr IG_V_gene 1 cds polymorphic_pseudogene 10 cds IG_V_gene 383 3utr nonsense_mediated_decay 3532 3utr protein_coding 1 cds IG_C_gene 36 5utr nonsense_mediated_decay 765 5utr protein_coding 197 cds nonsense_mediated_decay 5265 cds protein_coding 48 other nonsense_mediated_decay 1st max: 3utr 36 5utr 12 cds 0 2nd max: 3utr 0 5utr 0 cds 48 325 other protein_coding 1st max: 3utr 129 5utr 196 cds 0 
2nd max: 3utr 0 5utr 1 cds 324 IL4 2 cds polymorphic_pseudogene 13 cds IG_V_gene 632 3utr nonsense_mediated_decay 5375 3utr protein_coding 2 cds IG_C_gene 54 5utr nonsense_mediated_decay 982 5utr protein_coding 298 cds nonsense_mediated_decay 7738 cds protein_coding 56 other nonsense_mediated_decay 1st max: 3utr 45 5utr 11 cds 0 2nd max: 3utr 0 5utr 0 cds 56 481 other protein_coding 1st max: 3utr 257 5utr 224 cds 0 2nd max: 3utr 0 5utr 2 cds 479
#@ add intron as categ
# add E/I , PCG anno
# Runs pa2annotate-ExonIntron2.awk over the merged time-point files and the
# IFN/IL4 sample files; each input yields <input>.anno1.bed.
for i in *.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed \
         sh-clusters-I??.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt; do
  echo "$i"
  awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/pa2annotate-ExonIntron2.awk "$i" > "$i.anno1.bed"
done

#@ add de-duplication
for i in *anno1.bed; do
  echo "$i"
  awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/de-dup-clusters1.awk "$i" > "$i.ded"
done
sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed nmm 150887 57403 17974 31.312 56
sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed nmm 150887 47320 15135 31.9844 44
sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed nmm 150887 68119 21519 31.5903 79
sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed nmm 150887 21295 6741 31.6553 36
sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed nmm 150887 33981 10860 31.959 33

#get genomic coords, keep anno
for i in *anno1.bed.ded; do
  echo "$i"
  awk -f /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/tracks/paralyzer2tracks3.awk "$i" > "$i.genomic2.bed"
done

#bookend merge
# Strand-aware merge of the coordinate-sorted records; column 5 keeps its
# maximum, column 6 its distinct values, the other listed columns are collapsed.
# (The original redirected to a scratch file `foo` before the real target;
# only the last redirection receives data, so the stray empty file is dropped.)
for i in *.anno1.bed.ded.genomic2.bed; do
  sort -k1,1 -k2,2n "$i" \
    | bedtools merge -i - -s -c 4,5,6,7,8,9 -o collapse,max,distinct,collapse,collapse,collapse \
    > "$i.collapsed"
done

for i in *.anno1.bed.ded.genomic2.bed.collapsed; do
  #assign exon vs intron based on majority, exon for ties
  awk -f ../per-gene-exon-intron-stats1a.awk "$i" > "$i.exon-intron"
  # double gene anno for same region in exon-intron.log3, manual check of >5 pairs = > no exon/intron assignment conflict

  # get best cluster in collapsed
  awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-collapsed-cluster2.awk "$i" > "$i.best.bed"

  # intersect exon-intron-anno with transcript clusters
  # The awk step keeps fields 8-14 of each joined record and appends field 7
  # as the last column; the loops below test that last column for E / I.
  #bedtools intersect -a $i.exon-intron -b $i.best.bed -wa -wb -s > $i.anno2
  bedtools intersect -a "$i.exon-intron" -b "$i.best.bed" -wa -wb -s \
    | awk 'BEGIN{OFS="\t"}{print $8,$9,$10,$11,$12,$13,$14,$7}' > "$i.anno2"
  #bedtools intersect -a $i.exon-intron -b $i.best.bed -wa -wb -s |awk 'BEGIN{OFS="\t"}{print $8,$9,$10,$11,$12,$13,$14,$7}' > $i.anno2
done

# Split every annotated file on its last column: E -> .exon, I -> .intron.
for i in *.anno1.bed.ded.genomic2.bed.collapsed.anno2; do
  awk '$NF == "E"' "$i" > "$i.exon"
done
for i in *.anno1.bed.ded.genomic2.bed.collapsed.anno2; do
  awk '$NF == "I"' "$i" > "$i.intron"
  # Explicit file name instead of `!$` (history expansion is interactive-only).
  wc -l "$i.intron"
  #awk '{if ($NF=="I"){print $0}}' $i|sort -k1,1 -k2,2n |bedtools merge -i - -s -c 4,5,6,7,8 -o collapse,collapse,collapse,collapse,collapse > $i.intron
  #wc !$
  #same
done
36739 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron
30582 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron
43077 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron
13621 sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron
21801
sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron
(
  # Clusters of every *anno2 file that overlap a 3'UTR / 5'UTR / CDS region on
  # the same strand (-s); -u reports each cluster once. The hits are then
  # merged strand-aware with columns 4-8 collapsed.
  for i in *anno2; do
    for region in 3utr 5utr cds; do
      bedtools intersect -a "$i" -b "/data/results/reference/mmu/$region.bed" -wa -u -s \
        | sort -k1,1 -k2,2n \
        | bedtools merge -i - -s -c 4,5,6,7,8 -o collapse,collapse,collapse,collapse,collapse \
        > "$i.$region.bed"
      # Explicit file name instead of `!$` (history expansion is interactive-only).
      wc -l "$i.$region.bed"
    done
  done
)

# Same region split for the exon-assigned clusters, without the merge step.
for i in *.anno1.bed.ded.genomic2.bed.collapsed.anno2.exon; do
  for region in 3utr 5utr cds; do
    bedtools intersect -a "$i" -b "/data/results/reference/mmu/$region.bed" -wa -u -s \
      | sort -k1,1 -k2,2n > "$i.$region.bed"
    wc -l "$i.$region.bed"
  done
done

# Exon-assigned clusters that overlap none (-v) of CDS, 5'UTR or 3'UTR on the
# same strand are written to <file>.ncRNA.
for i in *.anno1.bed.ded.genomic2.bed.collapsed.anno2.exon; do
  bedtools intersect -v -a "$i" \
    -b /data/results/reference/mmu/cds.bed /data/results/reference/mmu/5utr.bed /data/results/reference/mmu/3utr.bed \
    -wa -s \
    | sort -k1,1 -k2,2n > "$i.ncRNA"
  wc -l "$i.ncRNA"
done
3633 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.anno2.exon.ncRNA
3071 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.anno2.exon.ncRNA
4274 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.anno2.exon.ncRNA
1348 sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed.anno2.exon.ncRNA
2191 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed.anno2.exon.ncRNA

# Per-gene statistics over the exon clusters of the five samples. The three
# region files are passed as awk variables and no input file is named.
# NOTE(review): presumably per-gene-exon-intron-stats1.awk reads f3/f5/fc
# itself -- confirm.
collapsed_beds=(
  "sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed"
  "sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed"
  "sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed"
  "sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed"
  "sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed"
)
for c in "${collapsed_beds[@]}"; do
  awk -v f3="$c.anno2.exon.3utr.bed" \
      -v f5="$c.anno2.exon.5utr.bed" \
      -v fc="$c.anno2.exon.cds.bed" \
      -f /data/images/proton/DKlab/mr/parclip/paralyzer/per-gene-exon-intron-stats1.awk \
    > "$c.best.per-gene-exons.bed"
done

#@ add 2nd class 'horizontal' annotation 3/5/cds/introns
for i in *f.flt.*anno1.bed.ded.genomic2.bed.collapsed.best.per-gene-exons.bed; do
  for region in 3utr 5utr cds intron; do
    bedtools intersect -a "$i" -b "/data/results/reference/mmu/$region.bed" -wa -u -s \
      | sort -k1,1 -k2,2n > "$i.$region.bed"
    wc -l "$i.$region.bed"
  done
done
10995 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.best.per-gene-exons.bed.3utr.bed
2230 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.best.per-gene-exons.bed.5utr.bed
10568 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.best.per-gene-exons.bed.cds.bed
7777 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.best.per-gene-exons.bed.3utr.bed
1266 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.best.per-gene-exons.bed.5utr.bed
8460 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.best.per-gene-exons.bed.cds.bed
11643 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.best.per-gene-exons.bed.3utr.bed
2845 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.best.per-gene-exons.bed.5utr.bed
13144 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.best.per-gene-exons.bed.cds.bed
3477 sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed.best.per-gene-exons.bed.3utr.bed
798 sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed.best.per-gene-exons.bed.5utr.bed
3965
sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed.best.per-gene-exons.bed.cds.bed
5927 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed.best.per-gene-exons.bed.3utr.bed
1117 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed.best.per-gene-exons.bed.5utr.bed
6657 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed.best.per-gene-exons.bed.cds.bed

# Region overlap of the ncRNA clusters: same-strand (-s) hits against each
# reference region file, every cluster reported once (-u), sorted by
# chromosome and start.
for i in *f.flt*.anno1.bed.ded.genomic2.bed.collapsed.anno2.exon.ncRNA; do
  for region in 3utr 5utr cds intron; do
    bedtools intersect -a "$i" -b "/data/results/reference/mmu/$region.bed" -wa -u -s \
      | sort -k1,1 -k2,2n > "$i.$region.bed"
    # Explicit file name instead of `!$` (history expansion is interactive-only).
    wc -l "$i.$region.bed"
  done
done
#all zero, as expected

# Same overlap for the intron-assigned clusters (UTR and CDS regions only).
for i in *f.flt*.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron; do
  for region in 3utr 5utr cds; do
    bedtools intersect -a "$i" -b "/data/results/reference/mmu/$region.bed" -wa -u -s \
      | sort -k1,1 -k2,2n > "$i.$region.bed"
    wc -l "$i.$region.bed"
  done
done
447 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron.3utr.bed
501 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron.5utr.bed
3198 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron.cds.bed
384 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron.3utr.bed
406 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron.5utr.bed
2783 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron.cds.bed
468 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron.3utr.bed
588 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron.5utr.bed
3634 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons.merged.maxAvg.bed.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron.cds.bed
124 sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron.3utr.bed
178 sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron.5utr.bed
974 sh-clusters-IFN.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron.cds.bed
215 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron.3utr.bed
253 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron.5utr.bed
1480 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.ded.genomic2.bed.collapsed.anno2.intron.cds.bed