Mercurial > repos > geert-vandeweyer > fastq_all_to_sanger
comparison FastQ_QualConverter.pl @ 0:c450731486c8 draft
Uploaded
author | geert-vandeweyer |
---|---|
date | Thu, 13 Feb 2014 08:24:33 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c450731486c8 |
---|---|
1 #!/usr/bin/perl | |
2 | |
3 # load modules | |
4 use Getopt::Std; | |
5 use File::Basename; | |
6 | |
7 ########## | |
8 ## opts ## | |
9 ########## | |
10 ## input files | |
11 # i : path to (i)nput fastq file | |
12 ## output files | |
13 # o : (o)utput file | |
14 ## optional options | |
15 # f : (f)ormat of input fastq file, default : detect automatically. | |
16 getopts('i:o:f:', \%opts) ; | |
17 | |
18 ## check existence of input file | |
19 if (!exists($opts{'i'}) || !-e $opts{'i'}) { | |
20 die('input file not found'); | |
21 } | |
22 | |
23 | |
24 | |
25 ############################ | |
26 ## Construct Quality hash ## | |
27 ############################ | |
28 my $qstring = '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'; | |
29 my @asci = ('') x 33; | |
30 my @qvalues = split('',$qstring); | |
31 push(@asci, @qvalues); | |
32 | |
33 ############################# | |
34 ## INPUT FILE SCORE FORMAT ## | |
35 ############################# | |
36 my $informat = ''; | |
37 my $validate = 0; | |
38 if (exists($opts{'f'}) && $opts{'f'} ne 'Auto') { | |
39 $informat = $opts{'f'}; | |
40 } | |
41 else { | |
42 $validate = 1; ## track during conversion to make sure. | |
43 my %tracker = (); | |
44 my $sanger = '!"#$%&\'()*+,-./0123456789:'; # items unique to sanger. | |
45 ## scan input file until clear. | |
46 open IN, $opts{'i'}; | |
47 $try = 0; | |
48 while ($try < 15000) { | |
49 $try ++; | |
50 my $name = <IN>; | |
51 my $seq = <IN>; | |
52 my $dump = <IN>; | |
53 my $qual = <IN>; | |
54 chomp($qual); | |
55 for ($i = 0; $i < length($qual); $i++) { | |
56 my $char = quotemeta(substr($qual,$i,1)); | |
57 if ($sanger =~ m/$char/) { | |
58 $informat = 'sanger'; | |
59 $try = 50000; | |
60 print "Item ($char) found in :\n $qual\n"; | |
61 last; | |
62 } | |
63 $tracker{$char} = 1; | |
64 } | |
65 } | |
66 close IN; | |
67 if ($informat eq '' ) { | |
68 ## no sanger (or ALL reads should be completely above qual=26) | |
69 ## check solexa items | |
70 foreach (qw/;<=>?/) { | |
71 if (exists($tracker{$_})) { | |
72 $informat = 'solexa'; | |
73 last; | |
74 } | |
75 } | |
76 } | |
77 if ($informat eq '') { | |
78 ## both illumina 1.3 and 1.5 are phred+64. | |
79 $informat = 'illumina'; | |
80 } | |
81 } | |
82 | |
83 | |
84 print "Input format detected : $informat\n"; | |
85 | |
86 ## SET CONVERSION RULE to sanger | |
87 my @conv_table; | |
88 if ($informat eq 'solexa') { | |
89 ## advanced conversion table | |
90 foreach (@qvalues) { | |
91 my $Q = 10 * log(1 + 10 ** (ord($_) - 64) / 10.0) / log(10); # solexa => phred | |
92 $conv_table[ord($_)] = $asci[$Q + 33]; # phred + 33 = sanger | |
93 } | |
94 } | |
95 elsif ($informat ne 'sanger') { | |
96 ## easy conversion table | |
97 foreach (@qvalues) { | |
98 $conv_table[ord($_)] = $asci[ord($_) - 31]; # base64 - 31 = sanger | |
99 } | |
100 } | |
101 else { | |
102 foreach (@qvalues) { | |
103 $conv_table[ord($_)] = $asci[ord($_)]; # sanger = sanger | |
104 } | |
105 } | |
106 | |
107 if ($informat eq 'sanger') { | |
108 system("cp -f ".$opts{'i'}." ".$opts{'o'}); | |
109 exit; | |
110 } | |
111 ###################### | |
112 ## CONVERT THE FILE ## | |
113 ###################### | |
114 open IN, $opts{'i'}; | |
115 open OUT, ">".$opts{'o'}; | |
116 my $counter = 0; | |
117 my $outstring = ''; | |
118 while ($readname = <IN>) { | |
119 $counter++; | |
120 my $seq = <IN>; | |
121 my $readnamebis = <IN>; | |
122 my $qstring = <IN>; | |
123 chomp($qstring); | |
124 my $outqual = ''; | |
125 foreach $ascval (unpack("C*",$qstring)) { # => returns the asci value of each item in string | |
126 $outqual .= $conv_table[$ascval]; | |
127 } | |
128 $outstring .= "$readname$seq$readnamebis$outqual\n"; | |
129 if ($counter > 500000) { # print per 500K reads. | |
130 print OUT $outstring; | |
131 $outstring = ''; | |
132 $counter = 0; | |
133 } | |
134 | |
135 } | |
136 print OUT $outstring; | |
137 close IN; | |
138 close OUT; |