annotate FastQ_QualConverter.pl @ 0:c450731486c8 draft

Uploaded
author geert-vandeweyer
date Thu, 13 Feb 2014 08:24:33 -0500
parents
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
1 #!/usr/bin/perl
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
2
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
3 # load modules
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
4 use Getopt::Std;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
5 use File::Basename;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
6
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
7 ##########
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
8 ## opts ##
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
9 ##########
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
10 ## input files
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
11 # i : path to (i)nput fastq file
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
12 ## output files
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
13 # o : (o)utput file
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
14 ## optional options
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
15 # f : (f)ormat of input fastq file, default : detect automatically.
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
16 getopts('i:o:f:', \%opts) ;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
17
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
18 ## check existence of input file
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
19 if (!exists($opts{'i'}) || !-e $opts{'i'}) {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
20 die('input file not found');
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
21 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
22
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
23
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
24
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
25 ############################
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
26 ## Construct Quality hash ##
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
27 ############################
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
28 my $qstring = '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~';
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
29 my @asci = ('') x 33;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
30 my @qvalues = split('',$qstring);
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
31 push(@asci, @qvalues);
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
32
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
33 #############################
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
34 ## INPUT FILE SCORE FORMAT ##
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
35 #############################
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
36 my $informat = '';
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
37 my $validate = 0;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
38 if (exists($opts{'f'}) && $opts{'f'} ne 'Auto') {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
39 $informat = $opts{'f'};
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
40 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
41 else {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
42 $validate = 1; ## track during conversion to make sure.
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
43 my %tracker = ();
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
44 my $sanger = '!"#$%&\'()*+,-./0123456789:'; # items unique to sanger.
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
45 ## scan input file until clear.
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
46 open IN, $opts{'i'};
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
47 $try = 0;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
48 while ($try < 15000) {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
49 $try ++;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
50 my $name = <IN>;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
51 my $seq = <IN>;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
52 my $dump = <IN>;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
53 my $qual = <IN>;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
54 chomp($qual);
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
55 for ($i = 0; $i < length($qual); $i++) {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
56 my $char = quotemeta(substr($qual,$i,1));
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
57 if ($sanger =~ m/$char/) {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
58 $informat = 'sanger';
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
59 $try = 50000;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
60 print "Item ($char) found in :\n $qual\n";
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
61 last;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
62 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
63 $tracker{$char} = 1;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
64 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
65 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
66 close IN;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
67 if ($informat eq '' ) {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
68 ## no sanger (or ALL reads should be completely above qual=26)
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
69 ## check solexa items
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
70 foreach (qw/;<=>?/) {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
71 if (exists($tracker{$_})) {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
72 $informat = 'solexa';
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
73 last;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
74 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
75 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
76 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
77 if ($informat eq '') {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
78 ## both illumina 1.3 and 1.5 are phred+64.
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
79 $informat = 'illumina';
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
80 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
81 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
82
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
83
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
84 print "Input format detected : $informat\n";
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
85
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
86 ## SET CONVERSION RULE to sanger
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
87 my @conv_table;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
88 if ($informat eq 'solexa') {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
89 ## advanced conversion table
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
90 foreach (@qvalues) {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
91 my $Q = 10 * log(1 + 10 ** (ord($_) - 64) / 10.0) / log(10); # solexa => phred
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
92 $conv_table[ord($_)] = $asci[$Q + 33]; # phred + 33 = sanger
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
93 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
94 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
95 elsif ($informat ne 'sanger') {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
96 ## easy conversion table
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
97 foreach (@qvalues) {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
98 $conv_table[ord($_)] = $asci[ord($_) - 31]; # base64 - 31 = sanger
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
99 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
100 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
101 else {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
102 foreach (@qvalues) {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
103 $conv_table[ord($_)] = $asci[ord($_)]; # sanger = sanger
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
104 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
105 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
106
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
107 if ($informat eq 'sanger') {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
108 system("cp -f ".$opts{'i'}." ".$opts{'o'});
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
109 exit;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
110 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
111 ######################
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
112 ## CONVERT THE FILE ##
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
113 ######################
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
114 open IN, $opts{'i'};
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
115 open OUT, ">".$opts{'o'};
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
116 my $counter = 0;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
117 my $outstring = '';
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
118 while ($readname = <IN>) {
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
119 $counter++;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
120 my $seq = <IN>;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
121 my $readnamebis = <IN>;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
122 my $qstring = <IN>;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
123 chomp($qstring);
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
124 my $outqual = '';
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
125 foreach $ascval (unpack("C*",$qstring)) { # => returns the asci value of each item in string
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
126 $outqual .= $conv_table[$ascval];
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
127 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
128 $outstring .= "$readname$seq$readnamebis$outqual\n";
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
129 if ($counter > 500000) { # print per 500K reads.
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
130 print OUT $outstring;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
131 $outstring = '';
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
132 $counter = 0;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
133 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
134
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
135 }
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
136 print OUT $outstring;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
137 close IN;
c450731486c8 Uploaded
geert-vandeweyer
parents:
diff changeset
138 close OUT;