Mercurial > repos > siyuan > prada
view pyPRADA_1.2/tools/bwa-0.5.7-mh/solid2fastq.pl @ 3:f17965495ec9 draft default tip
Uploaded
author | siyuan |
---|---|
date | Tue, 11 Mar 2014 12:14:01 -0400 |
parents | acc2ca1a3ba4 |
children |
line wrap: on
line source
#!/usr/bin/perl -w # Author: lh3 # Note: Ideally, this script should be written in C. It is a bit slow at present. # Also note that this script is different from the one contained in MAQ. use strict; use warnings; use Getopt::Std; my %opts; my $version = '0.1.3'; my $usage = qq{ Usage: solid2fastq.pl <in.title> <out.prefix> Note: <in.title> is the string showed in the `# Title:' line of a ".csfasta" read file. Then <in.title>F3.csfasta is read sequence file and <in.title>F3_QV.qual is the quality file. If <in.title>R3.csfasta is present, this script assumes reads are paired; otherwise reads will be regarded as single-end. The read name will be <out.prefix>:panel_x_y/[12] with `1' for R3 tag and `2' for F3. Usually you may want to use short <out.prefix> to save diskspace. Long <out.prefix> also causes troubles to maq. }; getopts('', \%opts); die($usage) if (@ARGV != 2); my ($title, $pre) = @ARGV; my (@fhr, @fhw); my @fn_suff = ('F3.csfasta', 'F3_QV.qual', 'R3.csfasta', 'R3_QV.qual'); my $is_paired = (-f "$title$fn_suff[2]" || -f "$title$fn_suff[2].gz")? 1 : 0; if ($is_paired) { # paired end for (0 .. 3) { my $fn = "$title$fn_suff[$_]"; $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); } open($fhw[0], "|gzip >$pre.read2.fastq.gz") || die; # this is NOT a typo open($fhw[1], "|gzip >$pre.read1.fastq.gz") || die; open($fhw[2], "|gzip >$pre.single.fastq.gz") || die; my (@df, @dr); @df = &read1(1); @dr = &read1(2); while (@df && @dr) { if ($df[0] eq $dr[0]) { # mate pair print {$fhw[0]} $df[1]; print {$fhw[1]} $dr[1]; @df = &read1(1); @dr = &read1(2); } else { if ($df[0] le $dr[0]) { print {$fhw[2]} $df[1]; @df = &read1(1); } else { print {$fhw[2]} $dr[1]; @dr = &read1(2); } } } if (@df) { print {$fhw[2]} $df[1]; while (@df = &read1(1, $fhr[0], $fhr[1])) { print {$fhw[2]} $df[1]; } } if (@dr) { print {$fhw[2]} $dr[1]; while (@dr = &read1(2, $fhr[2], $fhr[3])) { print {$fhw[2]} $dr[1]; } } close($fhr[$_]) for (0 .. $#fhr); close($fhw[$_]) for (0 .. $#fhw); } else { # single end for (0 .. 1) { my $fn = "$title$fn_suff[$_]"; $fn = "gzip -dc $fn.gz |" if (!-f $fn && -f "$fn.gz"); open($fhr[$_], $fn) || die("** Fail to open '$fn'.\n"); } open($fhw[2], "|gzip >$pre.single.fastq.gz") || die; my @df; while (@df = &read1(1, $fhr[0], $fhr[1])) { print {$fhw[2]} $df[1]; } close($fhr[$_]) for (0 .. $#fhr); close($fhw[2]); } sub read1 { my $i = shift(@_); my $j = ($i-1)<<1; my ($key, $seq); my ($fhs, $fhq) = ($fhr[$j], $fhr[$j|1]); while (<$fhs>) { my $t = <$fhq>; if (/^>(\d+)_(\d+)_(\d+)_[FR]3/) { $key = sprintf("%.4d_%.4d_%.4d", $1, $2, $3); # this line could be improved on 64-bit machines die(qq/** unmatched read name: '$_' != '$_'\n/) unless ($_ eq $t); my $name = "$pre:$1_$2_$3/$i"; $_ = substr(<$fhs>, 2); tr/0123./ACGTN/; my $s = $_; $_ = <$fhq>; s/^(\d+)\s*//; s/-1\b/0/eg; s/(\d+)\s*/chr($1+33)/eg; $seq = qq/\@$name\n$s+\n$_\n/; last; } } return defined($seq)? ($key, $seq) : (); }