0
|
1 package resize;
|
|
2
|
|
3 use strict;
|
|
4 use warnings;
|
|
5
|
|
6 use FindBin;
|
|
7 use lib $FindBin::Bin;
|
|
8 use Rcall qw ( histogram );
|
|
9
|
|
10 use Exporter;
|
|
11 our @ISA = qw( Exporter );
|
|
12 our @EXPORT_OK = qw( &size_distribution );
|
|
13
|
|
14 sub size_distribution
|
|
15 {
|
|
16 my ( $fastq, $fastq_out, $dir, $min, $max ) = @_;
|
|
17
|
|
18 my ( %fragments_size, %duplicates ) ;
|
|
19 my $num = size($min, $max, $fastq, $fastq_out, \%fragments_size, \%duplicates);
|
|
20
|
|
21 my $png = $dir.'histogram.png';
|
|
22 histogram(\%fragments_size, $png, $num);
|
|
23
|
|
24 my $size = $dir.'reads_size.txt';
|
|
25
|
|
26
|
|
27 my $pourcentage;
|
|
28 open my $o, '>', $size || die "cannot open $size $!\n";
|
|
29 print $o "size\tnumber\tpercentage\n";
|
|
30 foreach my $k (sort { $a <=> $b } keys %fragments_size )
|
|
31 {
|
|
32 $pourcentage = $fragments_size{$k} / $num * 100;
|
|
33
|
|
34 print $o "$k\t$fragments_size{$k}\t";
|
|
35 printf $o "%.2f\n",$pourcentage;
|
|
36 }
|
|
37 close $o;
|
|
38
|
|
39 my $dup = $dir.'duplicates.txt' ;
|
|
40 open $o, '>', $dup || die "cannot open $size $!\n";
|
|
41 print $o "size\tnumber\n";
|
|
42 foreach my $k (sort { $duplicates{$b} <=> $duplicates{$a} } keys %duplicates )
|
|
43 {
|
|
44 print $o "$k\t$duplicates{$k}\n";
|
|
45 }
|
|
46 close $o;
|
|
47 }
|
|
48
|
|
49 sub size
|
|
50 {
|
|
51 my ($min, $max, $in_file, $out_file, $sizeHashR, $duplicateHashR) = @_;
|
|
52 my ($numreads, $size, $cmp, $ok, $line) = (0, 0, 0, 0);
|
|
53 my @fastq;
|
|
54 open (my $in, $in_file) || die "cannot open $in_file $!\n";
|
|
55 open (my $out, ">".$out_file) || die "cannot create $out_file $!\n";
|
|
56 while(<$in>)
|
|
57 {
|
|
58 chomp $_;
|
|
59 $cmp++; $line++;
|
|
60 if ($cmp == 1)
|
|
61 {
|
|
62 die "file do not contain a @ at line $line\n" unless ($_ =~ /^\@/ );
|
|
63 $ok = 0; @fastq = ();
|
|
64 push(@fastq,$_);
|
|
65 }
|
|
66 elsif ($cmp == 2)
|
|
67 {
|
|
68 #die "unrecognized symbol at line $line\n" unless ($_ =~ /[atcgATCGnN]+/ || $_ =~ /^$/ );
|
|
69 push(@fastq,$_);
|
|
70 $size = length($_);
|
|
71 if ($size >= $min && $size <= $max)
|
|
72 {
|
|
73 $numreads++;
|
|
74 ${$sizeHashR}{$size}+=1;
|
|
75 ${$duplicateHashR}{$_}+=1 if (defined($duplicateHashR));
|
|
76 $ok = 1;
|
|
77 }
|
|
78 }
|
|
79 elsif ($cmp == 3 )
|
|
80 {
|
|
81 die "file do not contain a + at line $line\n" unless $_ =~ /^\+/;
|
|
82 push(@fastq,$_);
|
|
83 }
|
|
84 elsif ($cmp == 4 )
|
|
85 {
|
|
86 push(@fastq,$_);
|
|
87 $cmp = 0;
|
|
88 if ($ok == 1)
|
|
89 {
|
|
90 foreach my $t (@fastq)
|
|
91 {
|
|
92 print $out $t."\n";
|
|
93 }
|
|
94 }
|
|
95 }
|
|
96 }
|
|
97 close $in; close $out;
|
|
98 return $numreads;
|
|
99 }
|
|
100
|
|
101 1;
|