/data/results/tools/adapter/cutadapt-1.9.1/bin/cutadapt -q 10 -z -m 15 -c -a CGCCTTGGCCGTACAGCAG IL4/ugc_604_11_F3.csfasta IL4/ugc_604_11_F3.QV.qual >& IL4-q10-m15.log > IL4-q10-m15.fastq /data/results/tools/adapter/cutadapt-1.9.1/bin/cutadapt -q 20 -z -m 15 -c -a CGCCTTGGCCGTACAGCAG IL4/ugc_604_11_F3.csfasta IL4/ugc_604_11_F3.QV.qual >& IL4-q20-m15.log > IL4-q20-m15.fastq /data/results/tools/adapter/cutadapt-1.9.1/bin/cutadapt -q 25 -z -m 15 -c -a CGCCTTGGCCGTACAGCAG IL4/ugc_604_11_F3.csfasta IL4/ugc_604_11_F3.QV.qual >& IL4-q25-m15.log > IL4-q25-m15.fastq /data/results/tools/adapter/cutadapt-1.9.1/bin/cutadapt -q 30 -z -m 15 -c -a CGCCTTGGCCGTACAGCAG IL4/ugc_604_11_F3.csfasta IL4/ugc_604_11_F3.QV.qual >& IL4-q30-m15.log > IL4-q30-m15.fastq awk -f /data/results/tools/formats/fastq2fasta.awk IL4-q10-m15.fastq > IL4-q10-m15.fa $SHRIMP_FOLDER/bin/gmapper-cs -L /data/results/reference/mmu/mm9/mRNA-genomic-max-intron-15k/mm9-mRNA-introns1k-cs /data/images/proton/DKlab/mr/parclip/raw/IL4-q10-m15.fa -N 11 -n 1 --local -o 10 -v 20% -h 20% -r 30% -w 150% -e -255 -f -255 | samtools view -Sb - | samtools sort -o - - >IL4-q10-m15-15mMm1kIntron.bam 2>IL4-q10-m15-15mMm1kIntron-bam.log #without cutadapt: /data/images/proton/DKlab/mr/parclip/raw/IFNF-4p10.sam.bam.conversion.stats AtoC 132964 6.51976 GtoA 59637 2.92424 CtoA 244775 12.0023 TtoG 270082 13.2432 GtoC 91447 4.48401 AtoT 131781 6.46175 AtoG 169063 8.28984 GtoT 81503 3.99642 CtoT 311700 15.2839 CtoG 185016 9.07208 TtoA 188640 9.24977 TtoC 172793 8.47273 # read quality assessment: /data/images/proton/DKlab/mr/parclip/raw/pipeline1.sh /data/results/tools/solid/qc-filter/SOLiD_preprocess_filter_v2_MR1.pl -a on -o 0hrep1filt2 -f 0hrep1/ugc_604_1_F3.csfasta -g 0hrep1/ugc_604_1_F3.QV.qual wc 0hrep1/ugc_604_1_F3.csfasta 22967710 22967710 798859510 0hrep1/ugc_604_1_F3.csfasta wc 0hrep1filt_T_F3.csfasta 9835260 9835260 341609082 0hrep1filt_T_F3.csfasta # standart qual. filter removes >50% of reads # INDEX BUILDING # get ENSEMBL gtf annotation (for mm10) /data/results/reference/mmu/Mus_musculus.NCBIM37.64.gtf # lift coordinates to mm9 using https://genome.ucsc.edu/cgi-bin/hgLiftOver # prepare gtf for index building cd /data/results/reference/mmu/mm9/mRNA-genomic-max-intron-15k/ awk -f get-introns-max1k-v2.awk /data/results/reference/mmu/Mus_musculus.NCBIM37.64.gtf > Mus_musculus.NCBIM37.64-toMM9.tr-with-1k-introns.gtf # extract fasta seqs from whole genome using gtf /data/results/tools/align/cufflinks-2.2.1.Linux_x86_64/gffread -O -F -w Mus_musculus.NCBIM37.64-toMM9.tr-with-1k-introns.fa -g /data/results/reference/mmu/Mus_musculus/UCSC/mm9/Sequence/WholeGenomeFasta/genome.fa Mus_musculus.NCBIM37.64-toMM9.tr-with-1k-introns.gtf -M -d dupinfo.tr-with-1k-introns.txt # generate 2bit file needed for PARalyzer ln -s Mus_musculus.NCBIM37.64-toMM9.tr-with-1k-introns.fa mm9-mRNA-introns1k.fa /home/reczko/bin/faToTwoBit mm9-mRNA-introns1k.fa mm9-mRNA-introns1k.2bit #build index for shrimp export SHRIMP_FOLDER=/data/results/tools/align/SHRiMP_2_2_3 python /data/results/tools/align/SHRiMP_2_2_3/SHRiMP_2_2_3/utils/project-db.py --seed 010111111111111,101011111111111,110101111111111,111010111111111,111101011111111,111110101111111,111111010111111,111111101011111,111111110101111,111111111010111,111111111101011,111111111110101,111111111111010,111111111111101,111111111111111 --h-flag --shrimp-mode cs mm9-mRNA-introns1k.fa #prepare regions for annotation reczko@max:/data/results/reference/mmu$ awk -f get-ncRNA2.awk /data/results/reference/mmu/Mus_musculus.NCBIM37.64-toMM9.gtf > ncRNA.bed rm intron.bed cds.bed 3utr.bed 5utr.bed ; awk -f /data/results/tools/rnaseq/get-3utr4.awk /data/results/reference/mmu/Mus_musculus.NCBIM37.64-toMM9.gtf > /data/results/reference/mmu/Mus_musculus.NCBIM37.64-toMM9.3utr # all exons (needed as some ncRNA transctipts (e.g. retained intron) do not pass as ncRNA): rm exon.bed ; awk -f get-exon1.awk /data/results/reference/mmu/Mus_musculus.NCBIM37.64-toMM9.gtf > exon.bed #ALIGNMENT # doc in /data/results/tools/align/SHRiMP_2_2_3/README.SHRiMP132 #one sample export SHRIMP_FOLDER=/data/results/tools/align/SHRiMP_2_2_3 $SHRIMP_FOLDER/bin/gmapper-cs -L /data/results/reference/mmu/mm9/mRNA-genomic-max-intron-15k/mm9-mRNA-introns1k-cs /data/images/proton/DKlab/mr/parclip/raw/2hrep2/*.csfasta -N 11 -n 1 --local -o 10 -v 20% -h 20% -r 30% -w 150% -e -255 -f -255 | samtools view -Sb - | samtools sort -o - - >2hrep2-15mMm1kIntron.bam 2>2hrep2-15mMm1kIntron-bam.log -N/--threads Number of Threads (default: 1) -n/--cmw-mode Match Mode (default: unpaired:2 paired:4) --local Perform local alignment (default: disabled) -o/--report Maximum Hits per Read (default: 10) -v/--vec-threshold SW Vector Hit Threshold (default: 47.00%) -h/--full-threshold SW Full Hit Threshold (default: 50.00%) -r/--cmw-threshold Window Generation Threshold (default: 55.00%) -w/--match-window Match Window Length (default: 140.00%) -e/--ext-r SW Gap Extend Score(Reference)(default: -7) -f/--ext-q SW Gap Extend Score (Query) (default: -3) #all samples: /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/shrimp-map-mRNA1.sh /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/shrimp-map-mRNA2.sh #alignment statistics in: /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/shrimp-map-mRNA1.logB /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/shrimp-map-mRNA2.log #get mismatch stats /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/get-mismatch-stats1.sh #PARALYZER pre-processing #prepare flags in bam (input $i) and convert to bowtie format (output $i.md.bt): /data/images/proton/DKlab/mr/parclip/paralyzer/README-paralyzer15.txt for i in *-15mMm1kIntron.bam do echo $i # add MD flags for bowtie format /data/results/tools/samtools/samtools-1.3/samtools calmd -b $i /data/results/reference/mmu/mm9/mRNA-genomic-max-intron-15k/Mus_musculus.NCBIM37.64-toMM9.tr-with-1k-introns.fa >& /dev/null > foo.bam2 samtools view -h foo.bam2 | awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/sam2bowtie.awk2 > $i.md.bt mv foo.bam2 $i.md.bam done #PARALYZER #one sample: ./myp-100chunks.sh /data/images/proton/DKlab/mr/parclip/paralyzer/ini-files-1kIntron/sh.IGG.ini &> IGG.1kIntrons-log2 ./mergeMyPar-100chunks.sh sh-clusters-IGG.txt2-1kIntrons wc -l sh-clusters-IGG.txt2-1kIntrons.csv ./mergeMyPar-100chunks.sh sh-groups-IGG.txt2-1kIntrons ./mergeMyPar-100chunks.sh sh-distributions-IGG.txt2-1kIntrons #all samples: /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/myp-all-1kIntrons2.sh #post-processing # paralyzer to bed, more than 5 TtoC for i in sh-clusters-*.txt2-1kIntrons2.csv do echo $i awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/pa2bed-gt5tc.awk $i > $i.bed.gt5tc done for i in sh-clusters-*.txt2-1kIntrons2.csv do echo $i awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/pa2bed-gt5tc-plus.awk $i > $i.bed.gt5tc.plus wc $i.bed.gt5tc.plus done t=read.table("sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc.plus",sep="\t") ms=t$V5 summary((ms)) modscore_analyis(ms,"IL4") #0.5086 0.7810 0.8472 0.8496 0.9235 1.0000 #modscore_cutoff for IL4 0.984 0.154163276322031 summary(t$V5>=0.984) #3251 write.table(t[t$V5>=0.984,],"sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt",row.names=F,col.names=F,quote=FALSE,sep="\t") t=read.table("sh-clusters-IGG.txt2-1kIntrons2.csv.bed.gt5tc.plus",sep="\t") ms=t$V5 summary((ms)) modscore_analyis(ms,"IGG") # 0.5661 0.7798 0.8439 0.8469 0.9174 1.0000 # [1] "modscore_cutoff for IGG 0.974 0.285225342489041" write.table(t[t$V5>=0.974,],"sh-clusters-IGG.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt",row.names=F,col.names=F,quote=FALSE,sep="\t") summary(t$V5>=0.974) #1205 #IGG removal for i in sh-clusters-*.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt do wc $i bedtools intersect -a $i -b sh-clusters-IGG.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt -v -s > $i.noIGG wc $i.noIGG done # 3251 19506 525188 sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt # 3219 19314 519834 sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt.noIGG rm sh-clusters-IGG.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt.noIGG #empty #add 5/3utr/cds info: for i in sh-clusters-*.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt.noIGG do echo $i awk -f /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/tracks/paralyzer2tracks3.awk $i > $i".genomic.bed" done # add E/I , PCG anno awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/pa2annotate-ExonIntron2.awk sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt.noIGG > sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt.noIGG.anno1.bed #get genomic coords, keep anno for i in sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt.noIGG.anno1.bed do echo $i awk -f /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/tracks/paralyzer2tracks3.awk $i > $i".genomic2.bed" done #bookend merge clusters overlapping in replicates for i in sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc.plus.flt.noIGG.anno1.bed.genomic2.bed do cat $i |sort -k1,1 -k2,2n |bedtools merge -i - -s -c 4,5,6 -o collapse,max,distinct > $i".collapsed" done for i in *f.flt*.genomic.bed.collapsed do awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-collapsed-cluster1.awk $i > $i".best.bed" done # get info from distributions # /data/images/proton/DKlab/mr/parclip/paralyzer/get-TtoC-conversionPct-noCutoff2.awk adds: ## 1..6(same) AvgConversionPct MaxConversionPct ConversionEventCount SdevConversionPct for i in sh-distributions-???*.txt2-1kIntrons2.csv do echo $i cat $i| awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-TtoC-conversionPct-noCutoff2.awk|awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/pa-dist2bed2.awk > $i".avg.csv3.bed" done # merge info from distributions and clusters bedtools intersect -a sh-clusters-IL4.txt2-1kIntrons2.csv.bed.gt5tc -b sh-distributions-IL4.txt2-1kIntrons2.csv.avg.csv3.bed -wa -wb -s |awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/add_conversion_pct2.awk > sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2 bedtools intersect -a sh-clusters-IGG.txt2-1kIntrons2.csv.bed.gt5tc -b sh-distributions-IGG.txt2-1kIntrons2.csv.avg.csv3.bed -wa -wb -s |awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/add_conversion_pct2.awk > sh-clusters-IGG.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2 bedtools intersect -a sh-clusters-0hrep1.txt2-1kIntrons2.csv.bed.gt5tc -b sh-distributions-0hrep1.txt2-1kIntrons2.csv.avg.csv3.bed -wa -wb -s |awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/add_conversion_pct2.awk > sh-clusters-0hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2 bedtools intersect -a sh-clusters-0hrep2.txt2-1kIntrons2.csv.bed.gt5tc -b sh-distributions-0hrep2.txt2-1kIntrons2.csv.avg.csv3.bed -wa -wb -s |awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/add_conversion_pct2.awk > sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2 bedtools intersect -a sh-clusters-0hrep3.txt2-1kIntrons2.csv.bed.gt5tc -b sh-distributions-0hrep3.txt2-1kIntrons2.csv.avg.csv3.bed -wa -wb -s |awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/add_conversion_pct2.awk > sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2 # separate plus strand results for i in sh-clusters-*1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2 do wc $i awk -f ../filterPlusStrand.awk $i > $i.plus wc !$ done #1118143 6708858 197457040 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2 # 607183 3643098 107466384 sh-clusters-IL4.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus 224576 1347456 41122435 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2 126400 758400 23175703 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus (/ 126400 607183.0) 0.20817447128789837 # IGG separate # more than 4 reads and convEvPct > 5: # cp=1.0*nc/(nc+nnc); # if ((cp>0.05)&&(nr>=5)) {print $0;} awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/filter_GE5reads.awk sh-clusters-IGG.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus > sh-clusters-IGG.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.f # more than 4 reads: # and IGG removal: for i in sh-clusters-*1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus do wc $i bedtools intersect -a $i -b sh-clusters-IGG.txt2-1kIntrons.csv3.bed.gt5tc.gt0.25TtoC2.plus.f -v -s |awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/filter_GE5reads.awk |sort -k1,1 -k2,2n > $i.noIGG.f wc !$ done 86718 520308 15500167 sh-clusters-0hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus 81990 491940 14644251 sh-clusters-0hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f 40699 244194 7204283 sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus 38699 232194 6846831 sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f 112849 677094 20301783 sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus 109373 656238 19664891 sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f 41427 248562 7313012 sh-clusters-IGG.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus 28145 168870 4993566 sh-clusters-IGG.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f 126400 758400 23175703 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus 116449 698694 21331311 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f #IL4 check # adaptive filtering of modscore a) get cutoff #topuniform=max(which(lp8>0.05)); #highest modscore that is not uniformly distributed Rscript /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/get_modescore_hist_1kIntrons3_f.r [1] "modscore_cutoff for IFN 0.946 0.0591007877911048" ^^^^^ this is the modscore cutoff awk -v th=0.972 -f /data/images/proton/DKlab/mr/parclip/paralyzer/filter_modescore1_keep_best.awk /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f > /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt #IL4 check # 3819 22914 733905 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt awk -v th=0.979 -f /data/images/proton/DKlab/mr/parclip/paralyzer/filter_modescore1_keep_best.awk /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-0hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f > /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-0hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt awk -v th=0.97 -f /data/images/proton/DKlab/mr/parclip/paralyzer/filter_modescore1_keep_best.awk /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f > /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt # 1334 8004 245720 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt awk -v th=0.964 -f /data/images/proton/DKlab/mr/parclip/paralyzer/filter_modescore1_keep_best.awk /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f > /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt # 4059 24354 765801 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt # add E/I , PCG anno awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/pa2annotate-ExonIntron2.awk sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt > sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed #IL4 check #get genomic coords, keep anno awk -f /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/tracks/paralyzer2tracks3.awk /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed > /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed #IL4 check for i in sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed do bedtools intersect -a $i -b /data/results/reference/mmu/3utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.3utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/5utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.5utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/cds.bed -wa -u -s |sort -k1,1 -k2,2n > $i.cds.bed wc -l !$ done 472 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.3utr.bed 110 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.5utr.bed 587 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.cds.bed #IL4 check for i in /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed do bedtools intersect -a $i -b /data/results/reference/mmu/3utr.bed -wa -wb -s |sort -k1,1 -k2,2n |bedtools merge -i - -s -c 4,5,6,10 -o distinct,distinct,distinct,collapse > $i.3utr.bed wc -l !$ awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-modscore-collapsed-cluster1.awk $i.3utr.bed > $i".3utr.best.bed" bedtools intersect -a $i -b /data/results/reference/mmu/5utr.bed -wa -wb -s |sort -k1,1 -k2,2n |bedtools merge -i - -s -c 4,5,6,10 -o distinct,distinct,distinct,collapse > $i.5utr.bed wc -l !$ awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-modscore-collapsed-cluster1.awk $i.5utr.bed > $i".5utr.best.bed" bedtools intersect -a $i -b /data/results/reference/mmu/cds.bed -wa -wb -s |sort -k1,1 -k2,2n |bedtools merge -i - -s -c 4,5,6,10 -o distinct,distinct,distinct,collapse > $i.cds.bed wc -l !$ awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-modscore-collapsed-cluster1.awk $i.cds.bed > $i".cds.best.bed" bedtools intersect -a $i -b /data/results/reference/mmu/intron.bed -wa -wb -s |sort -k1,1 -k2,2n |bedtools merge -i - -s -c 4,5,6,10 -o distinct,distinct,distinct,collapse > $i.intron.bed wc -l !$ awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-modscore-collapsed-cluster1.awk $i.intron.bed > $i".intron.best.bed" bedtools intersect -a $i -b /data/results/reference/mmu/ncRNA.bed -wa -wb -s |sort -k1,1 -k2,2n |bedtools merge -i - -s -c 4,5,6,10 -o distinct,distinct,distinct,collapse > $i.ncRNA.bed wc -l !$ awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-modscore-collapsed-cluster1.awk $i.ncRNA.bed > $i".ncRNA.best.bed" bedtools intersect -a $i -b /data/results/reference/mmu/exon.bed -wa -wb -s |sort -k1,1 -k2,2n |bedtools merge -i - -s -c 4,5,6,10 -o distinct,distinct,distinct,collapse > $i.exon.bed wc -l !$ awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-modscore-collapsed-cluster1.awk $i.exon.bed > $i".exon.best.bed" done #461 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.3utr.bed #103 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.5utr.bed #566 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.cds.bed #3302 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.intron.bed #147 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.ncRNA.bed #1349 /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.exon.bed # (+ 461 103 566 147) #1277 #IL4 check awk -v fe="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.exon.best.bed" -v fn="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.ncRNA.best.bed" -v fi="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.intron.best.bed" -v f3="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.3utr.best.bed" -v f5="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.5utr.best.bed" -v fc="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.cds.best.bed" -f /data/images/proton/DKlab/mr/parclip/paralyzer/per-gene-stats3.awk > sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.per-gene.bed 3 intron sense_intronic 3 intron non_coding 147 ncRNA 1 intron IG_V_gene 22 intron transcribed_unprocessed_pseudogene 15 3utr nonsense_mediated_decay 192 3utr protein_coding 3 5utr nonsense_mediated_decay 5 intron unprocessed_pseudogene 29 5utr protein_coding 4 intron transcribed_processed_pseudogene 3 cds nonsense_mediated_decay 175 cds protein_coding 62 intron antisense 321 intron nonsense_mediated_decay 2773 intron protein_coding 519 intron retained_intron 512 intron processed_transcript 3 intron pseudogene 32 other nonsense_mediated_decay 1st max: 3utr 21 5utr 3 cds 8 2nd max: 3utr 0 5utr 0 cds 0 344 other protein_coding 1st max: 3utr 182 5utr 35 cds 127 2nd max: 3utr 0 5utr 0 cds 23 4 intron ambiguous_orf 74 intron lincRNA # exons without CDS of e.g. retained_introns were not included, add all exons as category for this case: awk -v fn="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.ncRNA.best.bed" -v fi="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.intron.best.bed" -v f3="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.3utr.best.bed" -v f5="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.5utr.best.bed" -v fc="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.cds.best.bed" -f /data/images/proton/DKlab/mr/parclip/paralyzer/per-gene-stats2.awk > sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.per-gene.bed chr4 44922000 44922104 ENSMUST00000067521_G6098.1_AAATAATGTCAGTAATTCAATATTGTCTTACTATTTAATACATTTTTGATGTATTAAATAGTACTGAGAGACCATTTCTTTTAATTATACAGTTATATAGCACAA_128_1231_5_9_1628_0.9819204955885362_1.000000_149_0.000000 1000 + ENSMUST00000142714|Zcchc7|nonsense_mediated_decay # cluster in 2 regions (all in ncRNA): awk '{i=++o[$4];t[$4,i]=$7"|"$8"|"$9;}END{for (i in o) {if (o[i]>1){print i;for (k=1;k<=o[i];k++){print t[i,k]}}}}' sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.per-gene.bed > foo #use genomic_chr genomic_start genomic_end genomic_strand transcript_id as unique key awk -v fe="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.exon.best.bed" -v fn="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.ncRNA.best.bed" -v fi="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.intron.best.bed" -v f3="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.3utr.best.bed" -v f5="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.5utr.best.bed" -v fc="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.cds.best.bed" -f /data/images/proton/DKlab/mr/parclip/paralyzer/per-gene-stats5.awk > sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.per-gene.bed2 2 intron sense_intronic 2 intron non_coding 2 ncRNA snoRNA 1 ncRNA rRNA 9 3utr nonsense_mediated_decay 355 3utr protein_coding 1 intron IG_V_gene 5 cds nonsense_mediated_decay 19 intron transcribed_unprocessed_pseudogene 277 cds protein_coding 1 5utr nonsense_mediated_decay 5 intron unprocessed_pseudogene 49 5utr protein_coding 2 intron transcribed_processed_pseudogene 47 intron antisense 38 ncRNA processed_transcript 125 intron nonsense_mediated_decay 2221 intron protein_coding 1 ncRNA snRNA 251 intron retained_intron 274 intron processed_transcript 33 ncRNA pseudogene 1 ncRNA misc_RNA 2 intron pseudogene 44 ncRNA lincRNA 15 ncRNA miRNA 54 intron lincRNA Dear George, at /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed you can find results for IL4. Some genes might need to be collapsed, as there are only 3819 clusters this might not be a problem. I'm finishing collapsing (also for 0h) until tomorrow. BW, Martin #get genomic coords, keep anno awk -f /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/tracks/paralyzer2tracks3.awk /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed > /data/images/proton/DKlab/mr/parclip/paralyzer/PARalyzer_v1_1d/sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed # get best cluster in collapsed for i in sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed do awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-collapsed-cluster2.awk $i > $i".best.bed" done #assign exon vs intron based on majority, exon for ties for i in sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed do awk -f ../per-gene-exon-intron-stats1a.awk $i > $i.exon-intron done # intersect exon-intron-anno with transcript clusters for i in sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed do bedtools intersect -a $i.exon-intron -b $i.best.bed -wa -wb -s |awk 'BEGIN{OFS="\t"}{print $8,$9,$10,$11,$12,$13,$14,$7}' > $i.anno2 done for i in sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.anno2 do awk '{if ($NF=="E"){print $0}}' $i > $i.exon done for i in sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.anno2 do awk '{if ($NF=="I"){print $0}}' $i > $i.intron wc -l !$ done for i in sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.anno2.exon do bedtools intersect -a $i -b /data/results/reference/mmu/3utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.3utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/5utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.5utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/cds.bed -wa -u -s |sort -k1,1 -k2,2n > $i.cds.bed wc -l !$ done for i in sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.anno2.exon do bedtools intersect -v -a $i -b /data/results/reference/mmu/cds.bed /data/results/reference/mmu/5utr.bed /data/results/reference/mmu/3utr.bed -wa -s |sort -k1,1 -k2,2n > $i.ncRNA wc -l !$ done #here awk -v f3="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.anno2.exon.3utr.bed" -v f5="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.anno2.exon.5utr.bed" -v fc="sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.anno2.exon.cds.bed" -f /data/images/proton/DKlab/mr/parclip/paralyzer/per-gene-exon-intron-stats1.awk > sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed (# filter protein_coding_genes and add annotation from /data/results/reference/mmu/mm9/1kIntrons2-stranded/Mus_musculus.NCBIM37.64-toMM9.headers for i in *txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt do awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/pa2annotate-ExonIntron1.awk $i |sort -k1,1 -k4,4 -k2,2n -k5,5n > $i.anno1.csv done ) for i in sh*rep*1kIntrons2*f.flt do cat $i | sort -k1,1 -k2,2n > $i.srt done bedtools multiinter -i sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt > sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2 bedtools multiinter -i sh-clusters-2hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt sh-clusters-2hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt > sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2 bedtools multiinter -i sh-clusters-6hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt sh-clusters-6hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt > sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2 ## 2of2 #add paralyzer details to intersection: awk -v f1="sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt" -v f2="sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt" -f ../get-multiintersect-details-2of2.awk sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2 > sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.bed awk -v f1="sh-clusters-2hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt" -v f2="sh-clusters-2hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt" -f ../get-multiintersect-details-2of2.awk sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2 > sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.bed awk -v f1="sh-clusters-6hrep1.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt" -v f2="sh-clusters-6hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt" -f ../get-multiintersect-details-2of2.awk sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2 > sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.bed #merge book-ended regions: awk -f ../get-merged-multiintersect-details1.awk sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.bed > sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.bed awk -f ../get-merged-multiintersect-details1.awk sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.bed > sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.bed awk -f ../get-merged-multiintersect-details1.awk sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.bed > sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.bed #get max scoring cluster, add max avgConvPct as score awk -f ../get-max-intersect-score2.awk sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.bed > sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.maxAvg.bed awk -f ../get-max-intersect-score2.awk sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.bed > sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.maxAvg.bed awk -f ../get-max-intersect-score2.awk sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.bed > sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.intersection-all.bed3-1kIntrons2.merged.maxAvg.bed (for i in *1kI*flt do echo $i awk -f /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/tracks/paralyzer2tracks2.awk $i > $i".genomic.bed" done for i in *genomic.bed do echo $i sort -k1,1 -k2,2n $i | awk -f /data/images/proton/DKlab/mr/parclip/tracks/correct-bed.awk > $i.srt; done ) awk -v f1="sh-clusters-0hrep2.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt" -v f2="sh-clusters-0hrep3.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.srt" -f ../get-multiintersect-details-1of2.awk sh-clusters-0h.gt0.25TtoC2.plus.noIGG.flt.srt.intersection-all.bed3-1kIntrons2 > sh-clusters-0h.gt0.25TtoC2.plus.noIGG.flt.srt.union.bed3-1kIntrons2.bed awk -f ../get-max-intersect-score2.awk sh-clusters-0h.gt0.25TtoC2.plus.noIGG.flt.srt.union.bed3-1kIntrons2.merged.bed > sh-clusters-0h.gt0.25TtoC2.plus.noIGG.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed # add E/I , PCG anno for i in *.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed do echo $i awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/pa2annotate-ExonIntron2.awk $i > $i.anno1.bed done for i in sh-clusters-I??.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt do echo $i awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/pa2annotate-ExonIntron2.awk $i > $i.anno1.bed done #get genomic coords, keep anno for i in *anno1.bed do echo $i awk -f /data/images/proton/DKlab/mr/parclip/shrimp-mRNA-introns1k/tracks/paralyzer2tracks3.awk $i > $i".genomic2.bed" done #bookend merge for i in *.anno1.bed.genomic2.bed do cat $i |sort -k1,1 -k2,2n |bedtools merge -i - -s -c 4,5,6,7,8,9 -o collapse,max,distinct,collapse,collapse,collapse > foo > $i".collapsed" done # get best cluster in collapsed for i in *.anno1.bed.genomic2.bed.collapsed do awk -f /data/images/proton/DKlab/mr/parclip/paralyzer/get-best-collapsed-cluster2.awk $i > $i".best.bed" done # intersect exon-intron-anno with transcript clusters for i in *.anno1.bed.genomic2.bed.collapsed do bedtools intersect -a $i.exon-intron -b $i.best.bed -wa -wb -s |awk 'BEGIN{OFS="\t"}{print $8,$9,$10,$11,$12,$13,$14,$7}' > $i.anno2 done for i in *.anno1.bed.genomic2.bed.collapsed.anno2 do awk '{if ($NF=="E"){print $0}}' $i > $i.exon done for i in *.anno1.bed.genomic2.bed.collapsed.anno2 do awk '{if ($NF=="I"){print $0}}' $i > $i.intron wc -l !$ done for i in *.anno1.bed.genomic2.bed.collapsed.anno2.exon do bedtools intersect -a $i -b /data/results/reference/mmu/3utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.3utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/5utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.5utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/cds.bed -wa -u -s |sort -k1,1 -k2,2n > $i.cds.bed wc -l !$ done for i in *.anno1.bed.genomic2.bed.collapsed.anno2.exon do bedtools intersect -v -a $i -b /data/results/reference/mmu/cds.bed /data/results/reference/mmu/5utr.bed /data/results/reference/mmu/3utr.bed -wa -s |sort -k1,1 -k2,2n > $i.ncRNA wc -l !$ done #2here #merge per-gene parts awk -v f3="sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.exon.3utr.bed" -v f5="sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.exon.5utr.bed" -v fc="sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.exon.cds.bed" -f /data/images/proton/DKlab/mr/parclip/paralyzer/per-gene-exon-intron-stats1.awk > sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed #recollect per-gene for i in *f.flt.*anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed do bedtools intersect -a $i -b /data/results/reference/mmu/3utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.3utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/5utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.5utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/cds.bed -wa -u -s |sort -k1,1 -k2,2n > $i.cds.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/intron.bed -wa -u -s |sort -k1,1 -k2,2n > $i.intron.bed wc -l !$ done for i in *f.flt*.anno1.bed.genomic2.bed.collapsed.anno2.exon.ncRNA do bedtools intersect -a $i -b /data/results/reference/mmu/3utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.3utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/5utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.5utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/cds.bed -wa -u -s |sort -k1,1 -k2,2n > $i.cds.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/intron.bed -wa -u -s |sort -k1,1 -k2,2n > $i.intron.bed wc -l !$ done #all zero, as expected for i in *f.flt*.anno1.bed.genomic2.bed.collapsed.anno2.intron do bedtools intersect -a $i -b /data/results/reference/mmu/3utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.3utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/5utr.bed -wa -u -s |sort -k1,1 -k2,2n > $i.5utr.bed wc -l !$ bedtools intersect -a $i -b /data/results/reference/mmu/cds.bed -wa -u -s |sort -k1,1 -k2,2n > $i.cds.bed wc -l !$ done sh-clusters-0h.gt0.25TtoC2.plus.noIGG.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno.bed.genomic2.bed.collapsed.best.per-gene-exons.bed sh-clusters-2h.gt0.25TtoC2.plus.noIGG.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno.bed.genomic2.bed.collapsed.best.per-gene-exons.bed sh-clusters-6h.gt0.25TtoC2.plus.noIGG.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno.bed.genomic2.bed.collapsed.best.per-gene-exons.bed sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.flt.anno.bed.genomic2.bed.collapsed.best.per-gene-exons.bed sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.flt.anno.bed.genomic2.bed.collapsed.best.per-gene-exons.bed 2nd annotation, clusters in PCGs: 7398 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.3utr.bed 1732 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.5utr.bed 8133 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.cds.bed 9720 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.intron.bed 5917 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.3utr.bed 1050 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.5utr.bed 6960 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.cds.bed 7776 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.intron.bed 8438 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.3utr.bed 2296 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.5utr.bed 10468 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.cds.bed 11794 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.intron.bed 2640 sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.3utr.bed 663 sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.5utr.bed 3175 sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.cds.bed 3612 sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.intron.bed 4213 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.3utr.bed 884 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.5utr.bed 5079 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.cds.bed 5674 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.best.per-gene-exons.bed.intron.bed 2nd annotation, ncRNA clusters overlapping with introns: 2523 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.exon.ncRNA.intron.bed 2084 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.exon.ncRNA.intron.bed 2985 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.exon.ncRNA.intron.bed 940 sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.anno2.exon.ncRNA.intron.bed 1543 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.anno2.exon.ncRNA.intron.bed 2nd annotation, clusters in introns (both PGC/ncRNA): 292 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.3utr.bed 360 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.5utr.bed 2193 sh-clusters-0h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.cds.bed 291 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.3utr.bed 286 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.5utr.bed 2034 sh-clusters-2h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.cds.bed 329 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.3utr.bed 426 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.5utr.bed 2569 sh-clusters-6h.gt0.25TtoC2.plus.noIGG.f.flt.srt.union.bed3-1kIntrons2.merged.maxAvg.bed.anno1.bed.genomic2.bed.collapsed.anno2.intron.cds.bed 94 sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.anno2.intron.3utr.bed 149 sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.anno2.intron.5utr.bed 752 sh-clusters-IFN.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.anno2.intron.cds.bed 167 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.anno2.intron.3utr.bed 202 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.anno2.intron.5utr.bed 1137 sh-clusters-IL4.txt2-1kIntrons2.csv3.bed.gt5tc.gt0.25TtoC2.plus.noIGG.f.flt.anno1.bed.genomic2.bed.collapsed.anno2.intron.cds.bed