annotate bwa-0.6.2/solid2fastq.pl @ 2:a294fbfcb1db draft default tip

Uploaded BWA
author ashvark
date Fri, 18 Jul 2014 07:55:59 -0400
parents dd1186b11b3b
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
1 #!/usr/bin/perl -w
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
2
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
3 # Author: lh3
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
4 # Note: Ideally, this script should be written in C. It is a bit slow at present.
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
5 # Also note that this script is different from the one contained in MAQ.
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
6
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
7 use strict;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
8 use warnings;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
9 use Getopt::Std;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
10
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
11 my %opts;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
12 my $version = '0.1.4';
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
13 my $usage = qq{
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
14 Usage: solid2fastq.pl <in.title> <out.prefix>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
15
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
16 Note: <in.title> is the string showed in the `# Title:' line of a
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
17 ".csfasta" read file. Then <in.title>F3.csfasta is read sequence
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
18 file and <in.title>F3_QV.qual is the quality file. If
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
19 <in.title>R3.csfasta is present, this script assumes reads are
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
20 paired; otherwise reads will be regarded as single-end.
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
21
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
22 The read name will be <out.prefix>:panel_x_y/[12] with `1' for R3
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
23 tag and `2' for F3. Usually you may want to use short <out.prefix>
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
24 to save diskspace. Long <out.prefix> also causes troubles to maq.
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
25
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
26 };
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
27
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
28 getopts('', \%opts);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
29 die($usage) if (@ARGV != 2);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
30 my ($title, $pre) = @ARGV;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
31 my (@fhr, @fhw);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
32 my @fn_suff = ('F3.csfasta', 'F3_QV.qual', 'R3.csfasta', 'R3_QV.qual');
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
33 my $is_paired = (-f "$title$fn_suff[2]" || -f "$title$fn_suff[2].gz")? 1 : 0;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
34 if ($is_paired) { # paired end
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
35 for (0 .. 3) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
36 my $fn = "$title$fn_suff[$_]";
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
37 $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
38 open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
39 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
40 open($fhw[0], "|gzip >$pre.read2.fastq.gz") || die; # this is NOT a typo
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
41 open($fhw[1], "|gzip >$pre.read1.fastq.gz") || die;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
42 open($fhw[2], "|gzip >$pre.single.fastq.gz") || die;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
43 my (@df, @dr);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
44 @df = &read1(1); @dr = &read1(2);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
45 while (@df && @dr) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
46 if ($df[0] eq $dr[0]) { # mate pair
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
47 print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
48 @df = &read1(1); @dr = &read1(2);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
49 } else {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
50 if ($df[0] le $dr[0]) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
51 print {$fhw[2]} $df[1];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
52 @df = &read1(1);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
53 } else {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
54 print {$fhw[2]} $dr[1];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
55 @dr = &read1(2);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
56 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
57 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
58 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
59 if (@df) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
60 print {$fhw[2]} $df[1];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
61 while (@df = &read1(1, $fhr[0], $fhr[1])) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
62 print {$fhw[2]} $df[1];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
63 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
64 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
65 if (@dr) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
66 print {$fhw[2]} $dr[1];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
67 while (@dr = &read1(2, $fhr[2], $fhr[3])) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
68 print {$fhw[2]} $dr[1];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
69 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
70 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
71 close($fhr[$_]) for (0 .. $#fhr);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
72 close($fhw[$_]) for (0 .. $#fhw);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
73 } else { # single end
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
74 for (0 .. 1) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
75 my $fn = "$title$fn_suff[$_]";
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
76 $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
77 open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n");
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
78 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
79 open($fhw[2], "|gzip >$pre.single.fastq.gz") || die;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
80 my @df;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
81 while (@df = &read1(1, $fhr[0], $fhr[1])) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
82 print {$fhw[2]} $df[1];
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
83 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
84 close($fhr[$_]) for (0 .. $#fhr);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
85 close($fhw[2]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
86 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
87
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
88 sub read1 {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
89 my $i = shift(@_);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
90 my $j = ($i-1)<<1;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
91 my ($key, $seq);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
92 my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
93 while (<$fhs>) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
94 my $t = <$fhq>;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
95 if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) {
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
96 $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
97 die(qq/** unmatched read name: '$_' != '$_'\n/) unless ($_ eq $t);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
98 my $name = "$pre:$1_$2_$3/$i";
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
99 $_ = substr(<$fhs>, 2);
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
100 tr/0123./ACGTN/;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
101 my $s = $_;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
102 $_ = <$fhq>;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
103 s/-1\b/0/eg;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
104 s/^(\d+)\s*//;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
105 s/(\d+)\s*/chr($1+33)/eg;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
106 $seq = qq/\@$name\n$s+\n$_\n/;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
107 last;
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
108 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
109 }
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
110 return defined($seq)? ($key, $seq) : ();
dd1186b11b3b Uploaded BWA
ashvark
parents:
diff changeset
111 }