Mercurial > repos > iuc > extract_genomic_dna
comparison extract_genomic_dna.py @ 11:80414c33a59a draft default tip
planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/extract_genomic_dna commit 6db2d98b513e4980788fcba49d809c91e5750296
author | iuc |
---|---|
date | Thu, 21 Nov 2024 07:20:29 +0000 |
parents | e400dcbc60d0 |
children |
comparison
equal
deleted
inserted
replaced
10:5cc8e93ee98f | 11:80414c33a59a |
---|---|
9 from bx.intervals.io import Comment, Header | 9 from bx.intervals.io import Comment, Header |
10 | 10 |
11 import extract_genomic_dna_utils as egdu # noqa: I100,I202 | 11 import extract_genomic_dna_utils as egdu # noqa: I100,I202 |
12 | 12 |
13 parser = argparse.ArgumentParser() | 13 parser = argparse.ArgumentParser() |
14 parser.add_argument('--input_format', dest='input_format', help="Input dataset format") | 14 parser.add_argument("--input_format", dest="input_format", help="Input dataset format") |
15 parser.add_argument('--input', dest='input', help="Input dataset") | 15 parser.add_argument("--input", dest="input", help="Input dataset") |
16 parser.add_argument('--genome', dest='genome', help="Input dataset genome build") | 16 parser.add_argument("--genome", dest="genome", help="Input dataset genome build") |
17 parser.add_argument('--interpret_features', dest='interpret_features', default=None, help="Interpret features if input format is gff") | 17 parser.add_argument( |
18 parser.add_argument('--columns', dest='columns', help="Columns to use in input file") | 18 "--interpret_features", |
19 parser.add_argument('--reference_genome_source', dest='reference_genome_source', help="Source of reference genome file") | 19 dest="interpret_features", |
20 parser.add_argument('--reference_genome', dest='reference_genome', help="Reference genome file") | 20 default=None, |
21 parser.add_argument('--output_format', dest='output_format', help="Output format") | 21 help="Interpret features if input format is gff", |
22 parser.add_argument('--fasta_header_type', dest='fasta_header_type', default=None, help="Fasta header format") | 22 ) |
23 parser.add_argument('--fasta_header_delimiter', dest='fasta_header_delimiter', default=None, help="Fasta header field delimiter") | 23 parser.add_argument("--columns", dest="columns", help="Columns to use in input file") |
24 parser.add_argument('--output', dest='output', help="Output dataset") | 24 parser.add_argument( |
25 "--reference_genome_source", | |
26 dest="reference_genome_source", | |
27 help="Source of reference genome file", | |
28 ) | |
29 parser.add_argument( | |
30 "--reference_genome", dest="reference_genome", help="Reference genome file" | |
31 ) | |
32 parser.add_argument("--output_format", dest="output_format", help="Output format") | |
33 parser.add_argument( | |
34 "--fasta_header_type", | |
35 dest="fasta_header_type", | |
36 default=None, | |
37 help="Fasta header format", | |
38 ) | |
39 parser.add_argument( | |
40 "--fasta_header_delimiter", | |
41 dest="fasta_header_delimiter", | |
42 default=None, | |
43 help="Fasta header field delimiter", | |
44 ) | |
45 parser.add_argument("--output", dest="output", help="Output dataset") | |
25 args = parser.parse_args() | 46 args = parser.parse_args() |
26 | 47 |
27 input_is_gff = args.input_format == 'gff' | 48 input_is_gff = args.input_format == "gff" |
28 interpret_features = input_is_gff and args.interpret_features == "yes" | 49 interpret_features = input_is_gff and args.interpret_features == "yes" |
29 if len(args.columns.split(',')) == 5: | 50 if len(args.columns.split(",")) == 5: |
30 # Bed file. | 51 # Bed file. |
31 chrom_col, start_col, end_col, strand_col, name_col = egdu.parse_cols_arg(args.columns) | 52 chrom_col, start_col, end_col, strand_col, name_col = egdu.parse_cols_arg( |
53 args.columns | |
54 ) | |
32 else: | 55 else: |
33 # Gff file. | 56 # Gff file. |
34 chrom_col, start_col, end_col, strand_col = egdu.parse_cols_arg(args.columns) | 57 chrom_col, start_col, end_col, strand_col = egdu.parse_cols_arg(args.columns) |
35 name_col = False | 58 name_col = False |
36 | 59 |
45 nibs = {} | 68 nibs = {} |
46 skipped_lines = 0 | 69 skipped_lines = 0 |
47 first_invalid_line = 0 | 70 first_invalid_line = 0 |
48 invalid_lines = [] | 71 invalid_lines = [] |
49 warnings = [] | 72 warnings = [] |
50 warning = '' | 73 warning = "" |
51 twobitfile = None | 74 twobitfile = None |
52 line_count = 1 | 75 line_count = 1 |
53 file_iterator = open(args.input) | 76 file_iterator = open(args.input) |
54 if interpret_features: | 77 if interpret_features: |
55 file_iterator = egdu.GFFReaderWrapper(file_iterator, fix_strand=False) | 78 file_iterator = egdu.GFFReaderWrapper(file_iterator, fix_strand=False) |
56 out = open(args.output, 'wt') | 79 out = open(args.output, "wt") |
57 | 80 |
58 for feature in file_iterator: | 81 for feature in file_iterator: |
59 # Ignore comments, headers. | 82 # Ignore comments, headers. |
60 if isinstance(feature, (Header, Comment)): | 83 if isinstance(feature, (Header, Comment)): |
61 line_count += 1 | 84 line_count += 1 |
68 start = feature.start | 91 start = feature.start |
69 end = feature.end | 92 end = feature.end |
70 strand = feature.strand | 93 strand = feature.strand |
71 else: | 94 else: |
72 # Processing lines, either interval or GFF format. | 95 # Processing lines, either interval or GFF format. |
73 line = feature.rstrip('\r\n') | 96 line = feature.rstrip("\r\n") |
74 if line and not line.startswith("#"): | 97 if line and not line.startswith("#"): |
75 fields = line.split('\t') | 98 fields = line.split("\t") |
76 try: | 99 try: |
77 chrom = fields[chrom_col] | 100 chrom = fields[chrom_col] |
78 start = int(fields[start_col]) | 101 start = int(fields[start_col]) |
79 end = int(fields[end_col]) | 102 end = int(fields[end_col]) |
80 if name_col: | 103 if name_col: |
97 if not invalid_lines: | 120 if not invalid_lines: |
98 invalid_lines = egdu.get_lines(feature) | 121 invalid_lines = egdu.get_lines(feature) |
99 first_invalid_line = line_count | 122 first_invalid_line = line_count |
100 skipped_lines += len(invalid_lines) | 123 skipped_lines += len(invalid_lines) |
101 continue | 124 continue |
102 if strand not in ['+', '-']: | 125 if strand not in ["+", "-"]: |
103 strand = '+' | 126 strand = "+" |
104 sequence = '' | 127 sequence = "" |
105 else: | 128 else: |
106 continue | 129 continue |
107 # Open sequence file and get sequence for feature/interval. | 130 # Open sequence file and get sequence for feature/interval. |
108 if os.path.exists("%s/%s.nib" % (seq_dir, chrom)): | 131 if os.path.exists("%s/%s.nib" % (seq_dir, chrom)): |
109 if chrom in nibs: | 132 if chrom in nibs: |
110 nib = nibs[chrom] | 133 nib = nibs[chrom] |
111 else: | 134 else: |
112 nibs[chrom] = nib = bx.seq.nib.NibFile(open("%s/%s.nib" % (seq_path, chrom))) | 135 nibs[chrom] = nib = bx.seq.nib.NibFile( |
136 open("%s/%s.nib" % (seq_path, chrom)) | |
137 ) | |
113 try: | 138 try: |
114 sequence = nib.get(start, end - start) | 139 sequence = nib.get(start, end - start) |
115 except Exception as e: | 140 except Exception: |
116 warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.genome) | 141 warning = ( |
142 "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " | |
143 % (start, end - start, args.genome) | |
144 ) | |
117 warnings.append(warning) | 145 warnings.append(warning) |
118 if not invalid_lines: | 146 if not invalid_lines: |
119 invalid_lines = egdu.get_lines(feature) | 147 invalid_lines = egdu.get_lines(feature) |
120 first_invalid_line = line_count | 148 first_invalid_line = line_count |
121 skipped_lines += len(invalid_lines) | 149 skipped_lines += len(invalid_lines) |
122 continue | 150 continue |
123 elif os.path.isfile(seq_path): | 151 elif os.path.isfile(seq_path): |
124 if not(twobitfile): | 152 if not (twobitfile): |
125 twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path)) | 153 twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path)) |
126 try: | 154 try: |
127 if interpret_features: | 155 if interpret_features: |
128 # Create sequence from intervals within a feature. | 156 # Create sequence from intervals within a feature. |
129 sequence = '' | 157 sequence = "" |
130 for interval in feature.intervals: | 158 for interval in feature.intervals: |
131 sequence += twobitfile[interval.chrom][interval.start:interval.end] | 159 sequence += twobitfile[interval.chrom][ |
160 interval.start: interval.end | |
161 ] | |
132 else: | 162 else: |
133 sequence = twobitfile[chrom][start:end] | 163 sequence = twobitfile[chrom][start:end] |
134 except Exception: | 164 except Exception: |
135 warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % (start, end - start, chrom) | 165 warning = ( |
166 "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " | |
167 % (start, end - start, chrom) | |
168 ) | |
136 warnings.append(warning) | 169 warnings.append(warning) |
137 if not invalid_lines: | 170 if not invalid_lines: |
138 invalid_lines = egdu.get_lines(feature) | 171 invalid_lines = egdu.get_lines(feature) |
139 first_invalid_line = line_count | 172 first_invalid_line = line_count |
140 skipped_lines += len(invalid_lines) | 173 skipped_lines += len(invalid_lines) |
141 continue | 174 continue |
142 else: | 175 else: |
143 warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.genome) | 176 warning = "Chromosome by name '%s' was not found for build '%s'. " % ( |
177 chrom, | |
178 args.genome, | |
179 ) | |
144 warnings.append(warning) | 180 warnings.append(warning) |
145 if not invalid_lines: | 181 if not invalid_lines: |
146 invalid_lines = egdu.get_lines(feature) | 182 invalid_lines = egdu.get_lines(feature) |
147 first_invalid_line = line_count | 183 first_invalid_line = line_count |
148 skipped_lines += len(invalid_lines) | 184 skipped_lines += len(invalid_lines) |
149 continue | 185 continue |
150 if sequence == '': | 186 if sequence == "": |
151 warning = "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " % (chrom, start, end, args.genome) | 187 warning = ( |
188 "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " | |
189 % (chrom, start, end, args.genome) | |
190 ) | |
152 warnings.append(warning) | 191 warnings.append(warning) |
153 if not invalid_lines: | 192 if not invalid_lines: |
154 invalid_lines = egdu.get_lines(feature) | 193 invalid_lines = egdu.get_lines(feature) |
155 first_invalid_line = line_count | 194 first_invalid_line = line_count |
156 skipped_lines += len(invalid_lines) | 195 skipped_lines += len(invalid_lines) |
159 sequence = egdu.reverse_complement(sequence) | 198 sequence = egdu.reverse_complement(sequence) |
160 if args.output_format == "fasta": | 199 if args.output_format == "fasta": |
161 if input_is_gff: | 200 if input_is_gff: |
162 start, end = egdu.convert_bed_coords_to_gff([start, end]) | 201 start, end = egdu.convert_bed_coords_to_gff([start, end]) |
163 if args.fasta_header_type == "bedtools_getfasta_default": | 202 if args.fasta_header_type == "bedtools_getfasta_default": |
164 out.write(">%s\n" % egdu.get_bedtools_getfasta_default_header(str(chrom), | 203 out.write( |
165 str(start), | 204 ">%s\n" |
166 str(end), | 205 % egdu.get_bedtools_getfasta_default_header( |
167 strand, | 206 str(chrom), str(start), str(end), strand, includes_strand_col |
168 includes_strand_col)) | 207 ) |
208 ) | |
169 else: | 209 else: |
170 # args.fasta_header_type == "char_delimited": | 210 # args.fasta_header_type == "char_delimited": |
171 fields = [args.genome, str(chrom), str(start), str(end), strand] | 211 fields = [args.genome, str(chrom), str(start), str(end), strand] |
172 field_delimiter = egdu.get_fasta_header_delimiter(args.fasta_header_delimiter) | 212 field_delimiter = egdu.get_fasta_header_delimiter( |
213 args.fasta_header_delimiter | |
214 ) | |
173 meta_data = field_delimiter.join(fields) | 215 meta_data = field_delimiter.join(fields) |
174 if name.strip(): | 216 if name.strip(): |
175 out.write(">%s %s\n" % (meta_data, name)) | 217 out.write(">%s %s\n" % (meta_data, name)) |
176 else: | 218 else: |
177 out.write(">%s\n" % meta_data) | 219 out.write(">%s\n" % meta_data) |
182 out.write("%s\n" % str(sequence[c:b])) | 224 out.write("%s\n" % str(sequence[c:b])) |
183 c = b | 225 c = b |
184 else: | 226 else: |
185 # output_format == "interval". | 227 # output_format == "interval". |
186 if interpret_features: | 228 if interpret_features: |
187 meta_data = "\t".join([feature.chrom, | 229 meta_data = "\t".join( |
188 "galaxy_extract_genomic_dna", | 230 [ |
189 "interval", | 231 feature.chrom, |
190 str(feature.start), | 232 "galaxy_extract_genomic_dna", |
191 str(feature.end), | 233 "interval", |
192 feature.score, | 234 str(feature.start), |
193 feature.strand, | 235 str(feature.end), |
194 ".", | 236 feature.score, |
195 egdu.gff_attributes_to_str(feature.attributes, "GTF")]) | 237 feature.strand, |
238 ".", | |
239 egdu.gff_attributes_to_str(feature.attributes, "GTF"), | |
240 ] | |
241 ) | |
196 else: | 242 else: |
197 # Here fields was set up around line 73. | 243 # Here fields was set up around line 73. |
198 meta_data = "\t".join(fields) | 244 meta_data = "\t".join(fields) |
199 if input_is_gff: | 245 if input_is_gff: |
200 format_str = "%s seq \"%s\";\n" | 246 format_str = '%s seq "%s";\n' |
201 else: | 247 else: |
202 format_str = "%s\t%s\n" | 248 format_str = "%s\t%s\n" |
203 out.write(format_str % (meta_data, str(sequence))) | 249 out.write(format_str % (meta_data, str(sequence))) |
204 # Update line count. | 250 # Update line count. |
205 if isinstance(feature, egdu.GFFFeature): | 251 if isinstance(feature, egdu.GFFFeature): |
212 warn_msg = "%d warnings, 1st is: " % len(warnings) | 258 warn_msg = "%d warnings, 1st is: " % len(warnings) |
213 warn_msg += warnings[0] | 259 warn_msg += warnings[0] |
214 print(warn_msg) | 260 print(warn_msg) |
215 if skipped_lines: | 261 if skipped_lines: |
216 # Error message includes up to the first 10 skipped lines. | 262 # Error message includes up to the first 10 skipped lines. |
217 print('Skipped %d invalid lines, 1st is #%d, "%s"' % (skipped_lines, first_invalid_line, '\n'.join(invalid_lines[:10]))) | 263 print( |
264 'Skipped %d invalid lines, 1st is #%d, "%s"' | |
265 % (skipped_lines, first_invalid_line, "\n".join(invalid_lines[:10])) | |
266 ) | |
218 | 267 |
219 if args.reference_genome_source == "history": | 268 if args.reference_genome_source == "history": |
220 os.remove(seq_path) | 269 os.remove(seq_path) |