Mercurial > repos > geert-vandeweyer > fastq_all_to_sanger
comparison FastQ_QualConverter.pl @ 0:c450731486c8 draft
Uploaded
| author | geert-vandeweyer |
|---|---|
| date | Thu, 13 Feb 2014 08:24:33 -0500 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:c450731486c8 |
|---|---|
| 1 #!/usr/bin/perl | |
| 2 | |
| 3 # load modules | |
| 4 use Getopt::Std; | |
| 5 use File::Basename; | |
| 6 | |
| 7 ########## | |
| 8 ## opts ## | |
| 9 ########## | |
| 10 ## input files | |
| 11 # i : path to (i)nput fastq file | |
| 12 ## output files | |
| 13 # o : (o)utput file | |
| 14 ## optional options | |
| 15 # f : (f)ormat of input fastq file, default : detect automatically. | |
| 16 getopts('i:o:f:', \%opts) ; | |
| 17 | |
| 18 ## check existence of input file | |
| 19 if (!exists($opts{'i'}) || !-e $opts{'i'}) { | |
| 20 die('input file not found'); | |
| 21 } | |
| 22 | |
| 23 | |
| 24 | |
| 25 ############################ | |
| 26 ## Construct Quality hash ## | |
| 27 ############################ | |
| 28 my $qstring = '!"#$%&\'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~'; | |
| 29 my @asci = ('') x 33; | |
| 30 my @qvalues = split('',$qstring); | |
| 31 push(@asci, @qvalues); | |
| 32 | |
| 33 ############################# | |
| 34 ## INPUT FILE SCORE FORMAT ## | |
| 35 ############################# | |
| 36 my $informat = ''; | |
| 37 my $validate = 0; | |
| 38 if (exists($opts{'f'}) && $opts{'f'} ne 'Auto') { | |
| 39 $informat = $opts{'f'}; | |
| 40 } | |
| 41 else { | |
| 42 $validate = 1; ## track during conversion to make sure. | |
| 43 my %tracker = (); | |
| 44 my $sanger = '!"#$%&\'()*+,-./0123456789:'; # items unique to sanger. | |
| 45 ## scan input file until clear. | |
| 46 open IN, $opts{'i'}; | |
| 47 $try = 0; | |
| 48 while ($try < 15000) { | |
| 49 $try ++; | |
| 50 my $name = <IN>; | |
| 51 my $seq = <IN>; | |
| 52 my $dump = <IN>; | |
| 53 my $qual = <IN>; | |
| 54 chomp($qual); | |
| 55 for ($i = 0; $i < length($qual); $i++) { | |
| 56 my $char = quotemeta(substr($qual,$i,1)); | |
| 57 if ($sanger =~ m/$char/) { | |
| 58 $informat = 'sanger'; | |
| 59 $try = 50000; | |
| 60 print "Item ($char) found in :\n $qual\n"; | |
| 61 last; | |
| 62 } | |
| 63 $tracker{$char} = 1; | |
| 64 } | |
| 65 } | |
| 66 close IN; | |
| 67 if ($informat eq '' ) { | |
| 68 ## no sanger (or ALL reads should be completely above qual=26) | |
| 69 ## check solexa items | |
| 70 foreach (qw/;<=>?/) { | |
| 71 if (exists($tracker{$_})) { | |
| 72 $informat = 'solexa'; | |
| 73 last; | |
| 74 } | |
| 75 } | |
| 76 } | |
| 77 if ($informat eq '') { | |
| 78 ## both illumina 1.3 and 1.5 are phred+64. | |
| 79 $informat = 'illumina'; | |
| 80 } | |
| 81 } | |
| 82 | |
| 83 | |
| 84 print "Input format detected : $informat\n"; | |
| 85 | |
| 86 ## SET CONVERSION RULE to sanger | |
| 87 my @conv_table; | |
| 88 if ($informat eq 'solexa') { | |
| 89 ## advanced conversion table | |
| 90 foreach (@qvalues) { | |
| 91 my $Q = 10 * log(1 + 10 ** (ord($_) - 64) / 10.0) / log(10); # solexa => phred | |
| 92 $conv_table[ord($_)] = $asci[$Q + 33]; # phred + 33 = sanger | |
| 93 } | |
| 94 } | |
| 95 elsif ($informat ne 'sanger') { | |
| 96 ## easy conversion table | |
| 97 foreach (@qvalues) { | |
| 98 $conv_table[ord($_)] = $asci[ord($_) - 31]; # base64 - 31 = sanger | |
| 99 } | |
| 100 } | |
| 101 else { | |
| 102 foreach (@qvalues) { | |
| 103 $conv_table[ord($_)] = $asci[ord($_)]; # sanger = sanger | |
| 104 } | |
| 105 } | |
| 106 | |
| 107 if ($informat eq 'sanger') { | |
| 108 system("cp -f ".$opts{'i'}." ".$opts{'o'}); | |
| 109 exit; | |
| 110 } | |
| 111 ###################### | |
| 112 ## CONVERT THE FILE ## | |
| 113 ###################### | |
| 114 open IN, $opts{'i'}; | |
| 115 open OUT, ">".$opts{'o'}; | |
| 116 my $counter = 0; | |
| 117 my $outstring = ''; | |
| 118 while ($readname = <IN>) { | |
| 119 $counter++; | |
| 120 my $seq = <IN>; | |
| 121 my $readnamebis = <IN>; | |
| 122 my $qstring = <IN>; | |
| 123 chomp($qstring); | |
| 124 my $outqual = ''; | |
| 125 foreach $ascval (unpack("C*",$qstring)) { # => returns the asci value of each item in string | |
| 126 $outqual .= $conv_table[$ascval]; | |
| 127 } | |
| 128 $outstring .= "$readname$seq$readnamebis$outqual\n"; | |
| 129 if ($counter > 500000) { # print per 500K reads. | |
| 130 print OUT $outstring; | |
| 131 $outstring = ''; | |
| 132 $counter = 0; | |
| 133 } | |
| 134 | |
| 135 } | |
| 136 print OUT $outstring; | |
| 137 close IN; | |
| 138 close OUT; |
