comparison FastQ_QualConverter.pl @ 0:c450731486c8 draft

Uploaded
author geert-vandeweyer
date Thu, 13 Feb 2014 08:24:33 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:c450731486c8
1 #!/usr/bin/perl
2
3 # load modules
4 use Getopt::Std;
5 use File::Basename;
6
7 ##########
8 ## opts ##
9 ##########
10 ## input files
11 # i : path to (i)nput fastq file
12 ## output files
13 # o : (o)utput file
14 ## optional options
15 # f : (f)ormat of input fastq file, default : detect automatically.
16 getopts('i:o:f:', \%opts) ;
17
18 ## check existence of input file
19 if (!exists($opts{'i'}) || !-e $opts{'i'}) {
20 die('input file not found');
21 }
22
23
24
25 ############################
26 ## Construct Quality hash ##
27 ############################
28 my $qstring = '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~';
29 my @asci = ('') x 33;
30 my @qvalues = split('',$qstring);
31 push(@asci, @qvalues);
32
33 #############################
34 ## INPUT FILE SCORE FORMAT ##
35 #############################
36 my $informat = '';
37 my $validate = 0;
38 if (exists($opts{'f'}) && $opts{'f'} ne 'Auto') {
39 $informat = $opts{'f'};
40 }
41 else {
42 $validate = 1; ## track during conversion to make sure.
43 my %tracker = ();
44 my $sanger = '!"#$%&\'()*+,-./0123456789:'; # items unique to sanger.
45 ## scan input file until clear.
46 open IN, $opts{'i'};
47 $try = 0;
48 while ($try < 15000) {
49 $try ++;
50 my $name = <IN>;
51 my $seq = <IN>;
52 my $dump = <IN>;
53 my $qual = <IN>;
54 chomp($qual);
55 for ($i = 0; $i < length($qual); $i++) {
56 my $char = quotemeta(substr($qual,$i,1));
57 if ($sanger =~ m/$char/) {
58 $informat = 'sanger';
59 $try = 50000;
60 print "Item ($char) found in :\n $qual\n";
61 last;
62 }
63 $tracker{$char} = 1;
64 }
65 }
66 close IN;
67 if ($informat eq '' ) {
68 ## no sanger (or ALL reads should be completely above qual=26)
69 ## check solexa items
70 foreach (qw/;<=>?/) {
71 if (exists($tracker{$_})) {
72 $informat = 'solexa';
73 last;
74 }
75 }
76 }
77 if ($informat eq '') {
78 ## both illumina 1.3 and 1.5 are phred+64.
79 $informat = 'illumina';
80 }
81 }
82
83
84 print "Input format detected : $informat\n";
85
86 ## SET CONVERSION RULE to sanger
87 my @conv_table;
88 if ($informat eq 'solexa') {
89 ## advanced conversion table
90 foreach (@qvalues) {
91 my $Q = 10 * log(1 + 10 ** (ord($_) - 64) / 10.0) / log(10); # solexa => phred
92 $conv_table[ord($_)] = $asci[$Q + 33]; # phred + 33 = sanger
93 }
94 }
95 elsif ($informat ne 'sanger') {
96 ## easy conversion table
97 foreach (@qvalues) {
98 $conv_table[ord($_)] = $asci[ord($_) - 31]; # base64 - 31 = sanger
99 }
100 }
101 else {
102 foreach (@qvalues) {
103 $conv_table[ord($_)] = $asci[ord($_)]; # sanger = sanger
104 }
105 }
106
107 if ($informat eq 'sanger') {
108 system("cp -f ".$opts{'i'}." ".$opts{'o'});
109 exit;
110 }
111 ######################
112 ## CONVERT THE FILE ##
113 ######################
114 open IN, $opts{'i'};
115 open OUT, ">".$opts{'o'};
116 my $counter = 0;
117 my $outstring = '';
118 while ($readname = <IN>) {
119 $counter++;
120 my $seq = <IN>;
121 my $readnamebis = <IN>;
122 my $qstring = <IN>;
123 chomp($qstring);
124 my $outqual = '';
125 foreach $ascval (unpack("C*",$qstring)) { # => returns the asci value of each item in string
126 $outqual .= $conv_table[$ascval];
127 }
128 $outstring .= "$readname$seq$readnamebis$outqual\n";
129 if ($counter > 500000) { # print per 500K reads.
130 print OUT $outstring;
131 $outstring = '';
132 $counter = 0;
133 }
134
135 }
136 print OUT $outstring;
137 close IN;
138 close OUT;