annotate pyPRADA_1.2/tools/samtools-0.1.16/misc/export2sam.pl @ 3:f17965495ec9 draft default tip

Uploaded
author siyuan
date Tue, 11 Mar 2014 12:14:01 -0400
parents acc2ca1a3ba4
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
1 #!/usr/bin/env perl
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
2 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
3 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
4 # export2sam.pl converts GERALD export files to SAM format.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
5 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
6 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
7 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
8 ########## License:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
9 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
10 # The MIT License
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
11 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
12 # Original SAMtools work copyright (c) 2008-2009 Genome Research Ltd.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
13 # Modified SAMtools work copyright (c) 2010 Illumina, Inc.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
14 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
15 # Permission is hereby granted, free of charge, to any person obtaining a copy
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
16 # of this software and associated documentation files (the "Software"), to deal
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
17 # in the Software without restriction, including without limitation the rights
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
18 # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
19 # copies of the Software, and to permit persons to whom the Software is
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
20 # furnished to do so, subject to the following conditions:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
21 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
22 # The above copyright notice and this permission notice shall be included in
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
23 # all copies or substantial portions of the Software.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
24 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
25 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
26 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
27 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
28 # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
29 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
30 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
31 # THE SOFTWARE.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
32 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
33 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
34 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
35 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
36 ########## ChangeLog:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
37 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
38 # Version: 2.3.1 (18MAR2011)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
39 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
40 # - Restore file '-' as stdin input.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
41 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
42 # Version: 2.3.0 (24JAN2011)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
43 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
44 # - Add support for export reserved chromosome name "CONTROL",
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
45 # which is translated to optional field "XC:Z:CONTROL".
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
46 # - Check for ".gz" file extension on export files and open
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
47 # these as gzip pipes when the extension is found.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
48 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
49 # Version: 2.2.0 (16NOV2010)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
50 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
51 # - Remove any leading zeros in export fields: RUNNO,LANE,TILE,X,Y
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
52 # - For export records with reserved chromosome name identifiers
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
53 # "QC" and "RM", add the optional field "XC:Z:QC" or "XC:Z:RM"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
54 # to the SAM record, so that these cases can be distinguished
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
55 # from other unmatched reads.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
56 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
57 # Version: 2.1.0 (21SEP2010)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
58 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
59 # - Additional export record error checking.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
60 # - Convert export records with chromomsome value of "RM" to unmapped
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
61 # SAM records.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
62 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
63 # Version: 2.0.0 (15FEB2010)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
64 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
65 # Script updated by Illumina in conjunction with CASAVA 1.7.0
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
66 # release.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
67 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
68 # Major changes are as follows:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
69 # - The CIGAR string has been updated to include all gaps from
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
70 # ELANDv2 alignments.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
71 # - The ELAND single read alignment score is always stored in the
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
72 # optional "SM" field and the ELAND paired read alignment score
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
73 # is stored in the optional "AS" field when it exists.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
74 # - The MAPQ value is set to the higher of the two alignment scores,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
75 # but no greater than 254, i.e. min(254,max(SM,AS))
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
76 # - The SAM "proper pair" bit (0x0002) is now set for read pairs
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
77 # meeting ELAND's expected orientation and insert size criteria.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
78 # - The default quality score translation is set for export files
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
79 # which contain Phread+64 quality values. An option,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
80 # "--qlogodds", has been added to translate quality values from
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
81 # the Solexa+64 format used in export files prior to Pipeline
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
82 # 1.3
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
83 # - The export match descriptor is now reverse-complemented when
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
84 # necessary such that it always corresponds to the forward
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
85 # strand of the reference, to be consistent with other
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
86 # information in the SAM record. It is now written to the
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
87 # optional 'XD' field (rather than 'MD') to acknowledge its
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
88 # minor differences from the samtools match descriptor (see
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
89 # additional detail below).
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
90 # - An option, "--nofilter", has been added to include reads which
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
91 # have failed primary analysis quality filtration. Such reads
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
92 # will have the corresponding SAM flag bit (0x0200) set.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
93 # - Labels in the export 'contig' field are preserved by setting
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
94 # RNAME to "$export_chromosome/$export_contig" when the contig
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
95 # label exists.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
96 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
97 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
98 # Contact: lh3
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
99 # Version: 0.1.2 (03JAN2009)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
100 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
101 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
102 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
103 ########## Known Conversion Limitations:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
104 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
105 # - Export records for reads that map to a position < 1 (allowed
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
106 # in export format), are converted to unmapped reads in the SAM
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
107 # record.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
108 # - Export records contain the reserved chromosome names: "NM",
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
109 # "QC","RM" and "CONTROL". "NM" indicates that the aligner could
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
110 # not map the read to the reference sequence set. "QC" means that
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
111 # the aligner did not attempt to map the read due to some
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
112 # technical limitation. "RM" means that the read mapped to a set
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
113 # of 'contaminant' sequences specified in GERALD's RNA-seq
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
114 # workflow. "CONTROL" means that the read is a control. All of
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
115 # these alignment types are collapsed to the single unmapped
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
116 # alignment state in the SAM record, but the optional SAM "XC"
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
117 # field is used to record the original reserved chromosome name of
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
118 # the read for all but the "NM" case.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
119 # - The export match descriptor is slightly different than the
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
120 # samtools match descriptor. For this reason it is stored in the
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
121 # optional SAM field 'XD' (and not 'MD'). Note that the export
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
122 # match descriptor differs from the samtools version in two
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
123 # respects: (1) indels are explicitly closed with the '$'
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
124 # character and (2) insertions must be enumerated in the match
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
125 # descriptor. For example a 35-base read with a two-base insertion
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
126 # is described as: 20^2$14
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
127 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
128 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
129 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
130
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
131 my $version = "2.3.1";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
132
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
133 use strict;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
134 use warnings;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
135
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
136 use Getopt::Long;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
137 use File::Spec;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
138 use List::Util qw(min max);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
139
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
140
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
141 use constant {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
142 EXPORT_MACHINE => 0,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
143 EXPORT_RUNNO => 1,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
144 EXPORT_LANE => 2,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
145 EXPORT_TILE => 3,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
146 EXPORT_X => 4,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
147 EXPORT_Y => 5,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
148 EXPORT_INDEX => 6,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
149 EXPORT_READNO => 7,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
150 EXPORT_READ => 8,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
151 EXPORT_QUAL => 9,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
152 EXPORT_CHROM => 10,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
153 EXPORT_CONTIG => 11,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
154 EXPORT_POS => 12,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
155 EXPORT_STRAND => 13,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
156 EXPORT_MD => 14,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
157 EXPORT_SEMAP => 15,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
158 EXPORT_PEMAP => 16,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
159 EXPORT_PASSFILT => 21,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
160 EXPORT_SIZE => 22,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
161 };
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
162
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
163
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
164 use constant {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
165 SAM_QNAME => 0,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
166 SAM_FLAG => 1,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
167 SAM_RNAME => 2,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
168 SAM_POS => 3,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
169 SAM_MAPQ => 4,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
170 SAM_CIGAR => 5,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
171 SAM_MRNM => 6,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
172 SAM_MPOS => 7,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
173 SAM_ISIZE => 8,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
174 SAM_SEQ => 9,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
175 SAM_QUAL => 10,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
176 };
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
177
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
178
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
179 # function prototypes for Richard's code
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
180 sub match_desc_to_cigar($);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
181 sub match_desc_frag_length($);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
182 sub reverse_compl_match_descriptor($);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
183 sub write_header($;$;$);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
184
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
185
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
186 &export2sam;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
187 exit;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
188
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
189
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
190
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
191
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
192 sub export2sam {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
193
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
194 my $cmdline = $0 . " " . join(" ",@ARGV);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
195 my $arg_count = scalar @ARGV;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
196 my $progname = (File::Spec->splitpath($0))[2];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
197
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
198 my $is_logodds_qvals = 0; # if true, assume files contain logodds (i.e. "solexa") quality values
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
199 my $is_nofilter = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
200 my $read1file;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
201 my $read2file;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
202 my $print_version = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
203 my $help = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
204
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
205 my $result = GetOptions( "qlogodds" => \$is_logodds_qvals,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
206 "nofilter" => \$is_nofilter,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
207 "read1=s" => \$read1file,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
208 "read2=s" => \$read2file,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
209 "version" => \$print_version,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
210 "help" => \$help );
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
211
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
212 my $usage = <<END;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
213
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
214 $progname converts GERALD export files to SAM format.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
215
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
216 Usage: $progname --read1=FILENAME [ options ] | --version | --help
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
217
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
218 --read1=FILENAME read1 export file or '-' for stdin (mandatory)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
219 (file may be gzipped with ".gz" extension)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
220 --read2=FILENAME read2 export file or '-' for stdin
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
221 (file may be gzipped with ".gz" extension)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
222 --nofilter include reads that failed the basecaller
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
223 purity filter
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
224 --qlogodds assume export file(s) use logodds quality values
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
225 as reported by OLB (Pipeline) prior to v1.3
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
226 (default: phred quality values)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
227
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
228 END
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
229
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
230 my $version_msg = <<END;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
231
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
232 $progname version: $version
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
233
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
234 END
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
235
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
236 if((not $result) or $help or ($arg_count==0)) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
237 die($usage);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
238 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
239
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
240 if(@ARGV) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
241 print STDERR "\nERROR: Unrecognized arguments: " . join(" ",@ARGV) . "\n\n";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
242 die($usage);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
243 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
244
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
245 if($print_version) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
246 die($version_msg);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
247 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
248
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
249 if(not defined($read1file)) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
250 print STDERR "\nERROR: read1 export file must be specified\n\n";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
251 die($usage);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
252 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
253
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
254 unless((-f $read1file) or ($read1file eq '-')) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
255 die("\nERROR: Can't find read1 export file: '$read1file'\n\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
256 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
257
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
258 if (defined $read2file) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
259 unless((-f $read2file) or ($read2file eq '-')) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
260 die("\nERROR: Can't find read2 export file: '$read2file'\n\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
261 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
262 if($read1file eq $read2file) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
263 die("\nERROR: read1 and read2 export filenames are the same: '$read1file'\n\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
264 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
265 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
266
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
267 my ($fh1, $fh2, $is_paired);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
268
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
269 my $read1cmd="$read1file";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
270 $read1cmd = "gzip -dc $read1file |" if($read1file =~ /\.gz$/);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
271 open($fh1, $read1cmd)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
272 or die("\nERROR: Can't open read1 process: '$read1cmd'\n\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
273 $is_paired = defined $read2file;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
274 if ($is_paired) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
275 my $read2cmd="$read2file";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
276 $read2cmd = "gzip -dc $read2file |" if($read2file =~ /\.gz$/);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
277 open($fh2, $read2cmd)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
278 or die("\nERROR: Can't open read2 process: '$read2cmd'\n\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
279 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
280 # quality value conversion table
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
281 my @conv_table;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
282 if($is_logodds_qvals){ # convert from solexa+64 quality values (pipeline pre-v1.3):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
283 for (-64..64) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
284 $conv_table[$_+64] = int(33 + 10*log(1+10**($_/10.0))/log(10)+.499);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
285 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
286 } else { # convert from phred+64 quality values (pipeline v1.3+):
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
287 for (-64..-1) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
288 $conv_table[$_+64] = undef;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
289 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
290 for (0..64) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
291 $conv_table[$_+64] = int(33 + $_);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
292 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
293 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
294 # write the header
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
295 print write_header( $progname, $version, $cmdline );
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
296 # core loop
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
297 my $export_line_count = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
298 while (<$fh1>) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
299 $export_line_count++;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
300 my (@s1, @s2);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
301 &export2sam_aux($_, $export_line_count, \@s1, \@conv_table, $is_paired, 1, $is_nofilter);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
302 if ($is_paired) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
303 my $read2line = <$fh2>;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
304 if(not $read2line){
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
305 die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read1 file at line no: $export_line_count.\n\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
306 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
307 &export2sam_aux($read2line, $export_line_count, \@s2, \@conv_table, $is_paired, 2, $is_nofilter);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
308
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
309 if (@s1 && @s2) { # then set mate coordinate
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
310 if($s1[SAM_QNAME] ne $s2[SAM_QNAME]){
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
311 die("\nERROR: Non-paired reads in export files on line: $export_line_count.\n Read1: $_ Read2: $read2line\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
312 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
313
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
314 my $isize = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
315 if ($s1[SAM_RNAME] ne '*' && $s1[SAM_RNAME] eq $s2[SAM_RNAME]) { # then calculate $isize
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
316 my $x1 = ($s1[SAM_FLAG] & 0x10)? $s1[SAM_POS] + length($s1[SAM_SEQ]) : $s1[SAM_POS];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
317 my $x2 = ($s2[SAM_FLAG] & 0x10)? $s2[SAM_POS] + length($s2[SAM_SEQ]) : $s2[SAM_POS];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
318 $isize = $x2 - $x1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
319 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
320
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
321 foreach ([\@s1,\@s2,$isize],[\@s2,\@s1,-$isize]){
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
322 my ($sa,$sb,$is) = @{$_};
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
323 if ($sb->[SAM_RNAME] ne '*') {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
324 $sa->[SAM_MRNM] = ($sb->[SAM_RNAME] eq $sa->[SAM_RNAME]) ? "=" : $sb->[SAM_RNAME];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
325 $sa->[SAM_MPOS] = $sb->[SAM_POS];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
326 $sa->[SAM_ISIZE] = $is;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
327 $sa->[SAM_FLAG] |= 0x20 if ($sb->[SAM_FLAG] & 0x10);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
328 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
329 $sa->[SAM_FLAG] |= 0x8;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
330 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
331 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
332 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
333 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
334 print join("\t", @s1), "\n" if (@s1);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
335 print join("\t", @s2), "\n" if (@s2 && $is_paired);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
336 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
337 close($fh1);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
338 if($is_paired) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
339 while(my $read2line = <$fh2>){
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
340 $export_line_count++;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
341 die("\nERROR: read1 and read2 export files do not contain the same number of reads.\n Extra reads observed in read2 file at line no: $export_line_count.\n\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
342 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
343 close($fh2);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
344 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
345 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
346
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
347 sub export2sam_aux {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
348 my ($line, $line_no, $s, $ct, $is_paired, $read_no, $is_nofilter) = @_;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
349 chomp($line);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
350 my @t = split("\t", $line);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
351 if(scalar(@t) < EXPORT_SIZE) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
352 my $msg="\nERROR: Unexpected number of fields in export record on line $line_no of read$read_no export file. Found " . scalar(@t) . " fields but expected " . EXPORT_SIZE . ".\n";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
353 $msg.="\t...erroneous export record:\n" . $line . "\n\n";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
354 die($msg);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
355 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
356 @$s = ();
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
357 my $isPassFilt = ($t[EXPORT_PASSFILT] eq 'Y');
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
358 return if(not ($isPassFilt or $is_nofilter));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
359 # read name
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
360 my $samQnamePrefix = $t[EXPORT_MACHINE] . (($t[EXPORT_RUNNO] ne "") ? "_" . int($t[EXPORT_RUNNO]) : "");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
361 $s->[SAM_QNAME] = join(':', $samQnamePrefix, int($t[EXPORT_LANE]), int($t[EXPORT_TILE]),
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
362 int($t[EXPORT_X]), int($t[EXPORT_Y]));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
363 # initial flag (will be updated later)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
364 $s->[SAM_FLAG] = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
365 if($is_paired) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
366 if($t[EXPORT_READNO] != $read_no){
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
367 die("\nERROR: read$read_no export file contains record with read number: " .$t[EXPORT_READNO] . " on line: $line_no\n\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
368 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
369 $s->[SAM_FLAG] |= 1 | 1<<(5 + $read_no);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
370 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
371 $s->[SAM_FLAG] |= 0x200 if (not $isPassFilt);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
372
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
373 # read & quality
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
374 my $is_export_rev = ($t[EXPORT_STRAND] eq 'R');
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
375 if ($is_export_rev) { # then reverse the sequence and quality
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
376 $s->[SAM_SEQ] = reverse($t[EXPORT_READ]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
377 $s->[SAM_SEQ] =~ tr/ACGTacgt/TGCAtgca/;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
378 $s->[SAM_QUAL] = reverse($t[EXPORT_QUAL]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
379 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
380 $s->[SAM_SEQ] = $t[EXPORT_READ];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
381 $s->[SAM_QUAL] = $t[EXPORT_QUAL];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
382 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
383 my @convqual = ();
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
384 foreach (unpack('C*', $s->[SAM_QUAL])){
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
385 my $val=$ct->[$_];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
386 if(not defined $val){
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
387 my $msg="\nERROR: can't interpret export quality value: " . $_ . " in read$read_no export file, line: $line_no\n";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
388 if( $_ < 64 ) { $msg .= " Use --qlogodds flag to translate logodds (solexa) quality values.\n"; }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
389 die($msg . "\n");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
390 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
391 push @convqual,$val;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
392 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
393
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
394 $s->[SAM_QUAL] = pack('C*',@convqual); # change coding
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
395
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
396
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
397 # coor
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
398 my $has_coor = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
399 $s->[SAM_RNAME] = "*";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
400 if (($t[EXPORT_CHROM] eq 'NM') or
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
401 ($t[EXPORT_CHROM] eq 'QC') or
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
402 ($t[EXPORT_CHROM] eq 'RM') or
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
403 ($t[EXPORT_CHROM] eq 'CONTROL')) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
404 $s->[SAM_FLAG] |= 0x4; # unmapped
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
405 push(@$s,"XC:Z:".$t[EXPORT_CHROM]) if($t[EXPORT_CHROM] ne 'NM');
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
406 } elsif ($t[EXPORT_CHROM] =~ /(\d+):(\d+):(\d+)/) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
407 $s->[SAM_FLAG] |= 0x4; # TODO: should I set BAM_FUNMAP in this case?
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
408 push(@$s, "H0:i:$1", "H1:i:$2", "H2:i:$3")
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
409 } elsif ($t[EXPORT_POS] < 1) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
410 $s->[SAM_FLAG] |= 0x4; # unmapped
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
411 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
412 $s->[SAM_RNAME] = $t[EXPORT_CHROM];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
413 $s->[SAM_RNAME] .= "/" . $t[EXPORT_CONTIG] if($t[EXPORT_CONTIG] ne '');
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
414 $has_coor = 1;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
415 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
416 $s->[SAM_POS] = $has_coor? $t[EXPORT_POS] : 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
417
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
418 # print STDERR "t[14] = " . $t[14] . "\n";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
419 my $matchDesc = '';
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
420 $s->[SAM_CIGAR] = "*";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
421 if($has_coor){
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
422 $matchDesc = ($is_export_rev) ? reverse_compl_match_descriptor($t[EXPORT_MD]) : $t[EXPORT_MD];
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
423
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
424 if($matchDesc =~ /\^/){
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
425 # construct CIGAR string using Richard's function
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
426 $s->[SAM_CIGAR] = match_desc_to_cigar($matchDesc); # indel processing
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
427 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
428 $s->[SAM_CIGAR] = length($s->[SAM_SEQ]) . "M";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
429 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
430 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
431
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
432 # print STDERR "cigar_string = $cigar_string\n";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
433
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
434 $s->[SAM_FLAG] |= 0x10 if ($has_coor && $is_export_rev);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
435 if($has_coor){
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
436 my $semap = ($t[EXPORT_SEMAP] ne '') ? $t[EXPORT_SEMAP] : 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
437 my $pemap = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
438 if($is_paired) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
439 $pemap = ($t[EXPORT_PEMAP] ne '') ? $t[EXPORT_PEMAP] : 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
440
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
441 # set `proper pair' bit if non-blank, non-zero PE alignment score:
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
442 $s->[SAM_FLAG] |= 0x02 if ($pemap > 0);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
443 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
444 $s->[SAM_MAPQ] = min(254,max($semap,$pemap));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
445 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
446 $s->[SAM_MAPQ] = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
447 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
448 # mate coordinate
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
449 $s->[SAM_MRNM] = '*';
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
450 $s->[SAM_MPOS] = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
451 $s->[SAM_ISIZE] = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
452 # aux
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
453 push(@$s, "BC:Z:$t[EXPORT_INDEX]") if ($t[EXPORT_INDEX]);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
454 if($has_coor){
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
455 # The export match descriptor differs slightly from the samtools match descriptor.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
456 # In order for the converted SAM files to be as compliant as possible,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
457 # we put the export match descriptor in optional field 'XD' rather than 'MD':
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
458 push(@$s, "XD:Z:$matchDesc");
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
459 push(@$s, "SM:i:$t[EXPORT_SEMAP]") if ($t[EXPORT_SEMAP] ne '');
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
460 push(@$s, "AS:i:$t[EXPORT_PEMAP]") if ($is_paired and ($t[EXPORT_PEMAP] ne ''));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
461 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
462 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
463
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
464
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
465
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
466 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
467 # the following code is taken from Richard Shaw's sorted2sam.pl file
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
468 #
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
469 sub reverse_compl_match_descriptor($)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
470 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
471 # print "\nREVERSING THE MATCH DESCRIPTOR!\n";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
472 my ($match_desc) = @_;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
473 my $rev_compl_match_desc = reverse($match_desc);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
474 $rev_compl_match_desc =~ tr/ACGT\^\$/TGCA\$\^/;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
475
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
476 # Unreverse the digits of numbers.
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
477 $rev_compl_match_desc = join('',
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
478 map {($_ =~ /\d+/)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
479 ? join('', reverse(split('', $_)))
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
480 : $_} split(/(\d+)/,
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
481 $rev_compl_match_desc));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
482
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
483 return $rev_compl_match_desc;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
484 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
485
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
486
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
487
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
488 sub match_desc_to_cigar($)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
489 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
490 my ($match_desc) = @_;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
491
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
492 my @match_desc_parts = split(/(\^.*?\$)/, $match_desc);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
493 my $cigar_str = '';
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
494 my $cigar_del_ch = 'D';
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
495 my $cigar_ins_ch = 'I';
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
496 my $cigar_match_ch = 'M';
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
497
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
498 foreach my $match_desc_part (@match_desc_parts) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
499 next if (!$match_desc_part);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
500
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
501 if ($match_desc_part =~ /^\^([ACGTN]+)\$$/) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
502 # Deletion
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
503 $cigar_str .= (length($1) . $cigar_del_ch);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
504 } elsif ($match_desc_part =~ /^\^(\d+)\$$/) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
505 # Insertion
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
506 $cigar_str .= ($1 . $cigar_ins_ch);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
507 } else {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
508 $cigar_str .= (match_desc_frag_length($match_desc_part)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
509 . $cigar_match_ch);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
510 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
511 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
512
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
513 return $cigar_str;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
514 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
515
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
516
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
517 #------------------------------------------------------------------------------
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
518
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
519 sub match_desc_frag_length($)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
520 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
521 my ($match_desc_str) = @_;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
522 my $len = 0;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
523
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
524 my @match_desc_fields = split(/([ACGTN]+)/, $match_desc_str);
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
525
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
526 foreach my $match_desc_field (@match_desc_fields) {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
527 next if ($match_desc_field eq '');
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
528
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
529 $len += (($match_desc_field =~ /(\d+)/)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
530 ? $1 : length($match_desc_field));
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
531 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
532
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
533 return $len;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
534 }
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
535
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
536
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
537 # argument holds the command line
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
538 sub write_header($;$;$)
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
539 {
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
540 my ($progname,$version,$cl) = @_;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
541 my $complete_header = "";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
542 $complete_header .= "\@PG\tID:$progname\tVN:$version\tCL:$cl\n";
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
543
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
544 return $complete_header;
acc2ca1a3ba4 Uploaded
siyuan
parents:
diff changeset
545 }