Mercurial > repos > romaingred > pirna_pipeline
comparison bin/resize.pm @ 0:198009598544 draft
Uploaded
author | romaingred |
---|---|
date | Wed, 11 Oct 2017 09:57:58 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:198009598544 |
---|---|
1 package resize; | |
2 | |
3 use strict; | |
4 use warnings; | |
5 | |
6 use FindBin; | |
7 use lib $FindBin::Bin; | |
8 use Rcall qw ( histogram ); | |
9 | |
10 use Exporter; | |
11 our @ISA = qw( Exporter ); | |
12 our @EXPORT_OK = qw( &size_distribution ); | |
13 | |
14 sub size_distribution | |
15 { | |
16 my ( $fastq, $fastq_out, $dir, $min, $max ) = @_; | |
17 | |
18 my ( %fragments_size, %duplicates ) ; | |
19 my $num = size($min, $max, $fastq, $fastq_out, \%fragments_size, \%duplicates); | |
20 | |
21 my $png = $dir.'histogram.png'; | |
22 histogram(\%fragments_size, $png, $num); | |
23 | |
24 my $size = $dir.'reads_size.txt'; | |
25 | |
26 | |
27 my $pourcentage; | |
28 open my $o, '>', $size || die "cannot open $size $!\n"; | |
29 print $o "size\tnumber\tpercentage\n"; | |
30 foreach my $k (sort { $a <=> $b } keys %fragments_size ) | |
31 { | |
32 $pourcentage = $fragments_size{$k} / $num * 100; | |
33 | |
34 print $o "$k\t$fragments_size{$k}\t"; | |
35 printf $o "%.2f\n",$pourcentage; | |
36 } | |
37 close $o; | |
38 | |
39 my $dup = $dir.'duplicates.txt' ; | |
40 open $o, '>', $dup || die "cannot open $size $!\n"; | |
41 print $o "size\tnumber\n"; | |
42 foreach my $k (sort { $duplicates{$b} <=> $duplicates{$a} } keys %duplicates ) | |
43 { | |
44 print $o "$k\t$duplicates{$k}\n"; | |
45 } | |
46 close $o; | |
47 } | |
48 | |
49 sub size | |
50 { | |
51 my ($min, $max, $in_file, $out_file, $sizeHashR, $duplicateHashR) = @_; | |
52 my ($numreads, $size, $cmp, $ok, $line) = (0, 0, 0, 0); | |
53 my @fastq; | |
54 open (my $in, $in_file) || die "cannot open $in_file $!\n"; | |
55 open (my $out, ">".$out_file) || die "cannot create $out_file $!\n"; | |
56 while(<$in>) | |
57 { | |
58 chomp $_; | |
59 $cmp++; $line++; | |
60 if ($cmp == 1) | |
61 { | |
62 die "file do not contain a @ at line $line\n" unless ($_ =~ /^\@/ ); | |
63 $ok = 0; @fastq = (); | |
64 push(@fastq,$_); | |
65 } | |
66 elsif ($cmp == 2) | |
67 { | |
68 #die "unrecognized symbol at line $line\n" unless ($_ =~ /[atcgATCGnN]+/ || $_ =~ /^$/ ); | |
69 push(@fastq,$_); | |
70 $size = length($_); | |
71 if ($size >= $min && $size <= $max) | |
72 { | |
73 $numreads++; | |
74 ${$sizeHashR}{$size}+=1; | |
75 ${$duplicateHashR}{$_}+=1 if (defined($duplicateHashR)); | |
76 $ok = 1; | |
77 } | |
78 } | |
79 elsif ($cmp == 3 ) | |
80 { | |
81 die "file do not contain a + at line $line\n" unless $_ =~ /^\+/; | |
82 push(@fastq,$_); | |
83 } | |
84 elsif ($cmp == 4 ) | |
85 { | |
86 push(@fastq,$_); | |
87 $cmp = 0; | |
88 if ($ok == 1) | |
89 { | |
90 foreach my $t (@fastq) | |
91 { | |
92 print $out $t."\n"; | |
93 } | |
94 } | |
95 } | |
96 } | |
97 close $in; close $out; | |
98 return $numreads; | |
99 } | |
100 | |
101 1; |