Mercurial > repos > petr-novak > repeatrxplorer
comparison bin/select_and_sort_contigs.pl @ 0:1d1b9e1b2e2f draft
Uploaded
| author | petr-novak |
|---|---|
| date | Thu, 19 Dec 2019 10:24:45 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:1d1b9e1b2e2f |
|---|---|
| 1 #!/usr/bin/env perl | |
| 2 | |
| 3 # Selects contigs exceeding the specified RD | |
| 4 # and sort them based on their properties | |
| 5 # which have to be provided in FASTA ID line: | |
| 6 # >CLXContigY (lenght-RD-lenght*RD) | |
| 7 # (RD = average Read Depth of the contig) | |
| 8 # (GR = genome representation (also marked as "pxd") = lenght*RD) | |
| 9 # | |
| 10 # ver. 03 - requires input filename {fn}.info.fasta | |
| 11 # - output saved to {fn}.info.minRDxxx.fasta | |
| 12 # | |
| 13 use warnings; | |
| 14 | |
| 15 $jm_contigs = $ARGV[0]; | |
| 16 if ($ARGV[1]) { | |
| 17 $min_pokryti = $ARGV[1]; | |
| 18 } else { | |
| 19 die "Missing argument - minRD\n"; | |
| 20 } | |
| 21 if ($jm_contigs =~/(\S+).fasta$/) { | |
| 22 $jm_vystup_zakladni = $1.".minRD".$min_pokryti; | |
| 23 } else { | |
| 24 $jm_vystup_zakladni = $jm_contigs.".minRD".$min_pokryti; | |
| 25 } | |
| 26 $jm_vystup_sort_RD = $jm_vystup_zakladni."_sort-RD.fasta"; # min. read depth | |
| 27 $jm_vystup_sort_delka = $jm_vystup_zakladni."_sort-length.fasta"; | |
| 28 $jm_vystup_sort_pxd = $jm_vystup_zakladni."_sort-GR.fasta"; | |
| 29 | |
| 30 %pokryti = (); # prum. pokryti contigu ctenimi | |
| 31 %delka_cont = (); # delka contigu (jeho consensu) | |
| 32 %pxd = (); # pxd = pokryti x delka | |
| 33 %sekvence = (); # sekvence contigu | |
| 34 | |
| 35 open (CONT, $jm_contigs) or die "Cannot open $jm_contigs\n"; | |
| 36 while ($radek = <CONT>) { | |
| 37 if ($radek =~/^>(CL\d+Contig\d+) \((\d+)\-(\S+)\-(\d+)\)/) { | |
| 38 if ($3>=$min_pokryti) { | |
| 39 # print "$1 : delka $2, prum. pokryti $3, pxd $4\n"; | |
| 40 $delka_cont{$1} = $2; | |
| 41 $pokryti{$1} = $3; | |
| 42 $pxd{$1} = $4; | |
| 43 } | |
| 44 } | |
| 45 } | |
| 46 close CONT; | |
| 47 | |
| 48 # ze souboru contigu se nactou sekvence tech, vybranych v predchozi fazi | |
| 49 open (CONT, $jm_contigs) or die "Cannot open $jm_contigs\n"; | |
| 50 $ukladat = 0; | |
| 51 while ($radek = <CONT>) { | |
| 52 if ($radek =~/>(CL\d+Contig\d+)/) { | |
| 53 if (exists($pokryti{$1})) { | |
| 54 $ukladat = 1; | |
| 55 $jmeno = $1; | |
| 56 } else { | |
| 57 $ukladat = 0; | |
| 58 } | |
| 59 } elsif ($ukladat) { | |
| 60 $sekvence{$jmeno} .= $radek; | |
| 61 } | |
| 62 } | |
| 63 close CONT; | |
| 64 | |
| 65 # vygeneruji se soubory, kde jsou contigy co prosly pres >= $min_pokryti setrideny | |
| 66 # podle pokryti, delky, nebo delky x pokryti | |
| 67 open (VYSTUP, ">$jm_vystup_sort_RD") or die "Cannot write to $jm_vystup_sort_RD\n"; | |
| 68 foreach $klic (sort {$pokryti{$b} <=> $pokryti{$a}} keys(%pokryti)) { | |
| 69 print VYSTUP ">$klic ($delka_cont{$klic}\-$pokryti{$klic}\-$pxd{$klic})\n"; | |
| 70 print VYSTUP "$sekvence{$klic}"; | |
| 71 } | |
| 72 close VYSTUP; | |
| 73 open (VYSTUP, ">$jm_vystup_sort_delka") or die "Cannot write to $jm_vystup_sort_delka\n"; | |
| 74 foreach $klic (sort {$delka_cont{$b} <=> $delka_cont{$a}} keys(%delka_cont)) { | |
| 75 print VYSTUP ">$klic ($delka_cont{$klic}\-$pokryti{$klic}\-$pxd{$klic})\n"; | |
| 76 print VYSTUP "$sekvence{$klic}"; | |
| 77 } | |
| 78 close VYSTUP; | |
| 79 open (VYSTUP, ">$jm_vystup_sort_pxd") or die "Cannot write to $jm_vystup_sort_pxd\n"; | |
| 80 foreach $klic (sort {$pxd{$b} <=> $pxd{$a}} keys(%pxd)) { | |
| 81 print VYSTUP ">$klic ($delka_cont{$klic}\-$pokryti{$klic}\-$pxd{$klic})\n"; | |
| 82 print VYSTUP "$sekvence{$klic}"; | |
| 83 } | |
| 84 close VYSTUP; | |
| 85 | |
| 86 $celkem_pxd = 0; | |
| 87 foreach $hodnota (values(%pxd)) { | |
| 88 # print "H: $hodnota\n"; | |
| 89 $celkem_pxd += $hodnota; | |
| 90 } | |
| 91 print "\nTotal GR: $celkem_pxd\n\n"; |
