Mercurial > repos > siyuan > prada
comparison pyPRADA_1.2/tools/bwa-0.5.7-mh/solid2fastq.pl @ 0:acc2ca1a3ba4
Uploaded
author | siyuan |
---|---|
date | Thu, 20 Feb 2014 00:44:58 -0500 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:acc2ca1a3ba4 |
---|---|
1 #!/usr/bin/perl -w | |
2 | |
3 # Author: lh3 | |
4 # Note: Ideally, this script should be written in C. It is a bit slow at present. | |
5 # Also note that this script is different from the one contained in MAQ. | |
6 | |
7 use strict; | |
8 use warnings; | |
9 use Getopt::Std; | |
10 | |
11 my %opts; | |
12 my $version = '0.1.3'; | |
13 my $usage = qq{ | |
14 Usage: solid2fastq.pl <in.title> <out.prefix> | |
15 | |
16 Note: <in.title> is the string showed in the `# Title:' line of a | |
17 ".csfasta" read file. Then <in.title>F3.csfasta is read sequence | |
18 file and <in.title>F3_QV.qual is the quality file. If | |
19 <in.title>R3.csfasta is present, this script assumes reads are | |
20 paired; otherwise reads will be regarded as single-end. | |
21 | |
22 The read name will be <out.prefix>:panel_x_y/[12] with `1' for R3 | |
23 tag and `2' for F3. Usually you may want to use short <out.prefix> | |
24 to save diskspace. Long <out.prefix> also causes troubles to maq. | |
25 | |
26 }; | |
27 | |
28 getopts('', \%opts); | |
29 die($usage) if (@ARGV != 2); | |
30 my ($title, $pre) = @ARGV; | |
31 my (@fhr, @fhw); | |
32 my @fn_suff = ('F3.csfasta', 'F3_QV.qual', 'R3.csfasta', 'R3_QV.qual'); | |
33 my $is_paired = (-f "$title$fn_suff[2]" || -f "$title$fn_suff[2].gz")? 1 : 0; | |
34 if ($is_paired) { # paired end | |
35 for (0 .. 3) { | |
36 my $fn = "$title$fn_suff[$_]"; | |
37 $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); | |
38 open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); | |
39 } | |
40 open($fhw[0], "|gzip >$pre.read2.fastq.gz") || die; # this is NOT a typo | |
41 open($fhw[1], "|gzip >$pre.read1.fastq.gz") || die; | |
42 open($fhw[2], "|gzip >$pre.single.fastq.gz") || die; | |
43 my (@df, @dr); | |
44 @df = &read1(1); @dr = &read1(2); | |
45 while (@df && @dr) { | |
46 if ($df[0] eq $dr[0]) { # mate pair | |
47 print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1]; | |
48 @df = &read1(1); @dr = &read1(2); | |
49 } else { | |
50 if ($df[0] le $dr[0]) { | |
51 print {$fhw[2]} $df[1]; | |
52 @df = &read1(1); | |
53 } else { | |
54 print {$fhw[2]} $dr[1]; | |
55 @dr = &read1(2); | |
56 } | |
57 } | |
58 } | |
59 if (@df) { | |
60 print {$fhw[2]} $df[1]; | |
61 while (@df = &read1(1, $fhr[0], $fhr[1])) { | |
62 print {$fhw[2]} $df[1]; | |
63 } | |
64 } | |
65 if (@dr) { | |
66 print {$fhw[2]} $dr[1]; | |
67 while (@dr = &read1(2, $fhr[2], $fhr[3])) { | |
68 print {$fhw[2]} $dr[1]; | |
69 } | |
70 } | |
71 close($fhr[$_]) for (0 .. $#fhr); | |
72 close($fhw[$_]) for (0 .. $#fhw); | |
73 } else { # single end | |
74 for (0 .. 1) { | |
75 my $fn = "$title$fn_suff[$_]"; | |
76 $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); | |
77 open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); | |
78 } | |
79 open($fhw[2], "|gzip >$pre.single.fastq.gz") || die; | |
80 my @df; | |
81 while (@df = &read1(1, $fhr[0], $fhr[1])) { | |
82 print {$fhw[2]} $df[1]; | |
83 } | |
84 close($fhr[$_]) for (0 .. $#fhr); | |
85 close($fhw[2]); | |
86 } | |
87 | |
88 sub read1 { | |
89 my $i = shift(@_); | |
90 my $j = ($i-1)<<1; | |
91 my ($key, $seq); | |
92 my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]); | |
93 while (<$fhs>) { | |
94 my $t = <$fhq>; | |
95 if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) { | |
96 $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines | |
97 die(qq/** unmatched read name: '$_' != '$_'\n/) unless ($_ eq $t); | |
98 my $name = "$pre:$1_$2_$3/$i"; | |
99 $_ = substr(<$fhs>, 2); | |
100 tr/0123./ACGTN/; | |
101 my $s = $_; | |
102 $_ = <$fhq>; | |
103 s/^(\d+)\s*//; | |
104 s/-1\b/0/eg; | |
105 s/(\d+)\s*/chr($1+33)/eg; | |
106 $seq = qq/\@$name\n$s+\n$_\n/; | |
107 last; | |
108 } | |
109 } | |
110 return defined($seq)? ($key, $seq) : (); | |
111 } |