comparison extract_genomic_dna.py @ 11:80414c33a59a draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/extract_genomic_dna commit 6db2d98b513e4980788fcba49d809c91e5750296
author iuc
date Thu, 21 Nov 2024 07:20:29 +0000
parents e400dcbc60d0
children
comparison
equal deleted inserted replaced
10:5cc8e93ee98f 11:80414c33a59a
9 from bx.intervals.io import Comment, Header 9 from bx.intervals.io import Comment, Header
10 10
11 import extract_genomic_dna_utils as egdu # noqa: I100,I202 11 import extract_genomic_dna_utils as egdu # noqa: I100,I202
12 12
13 parser = argparse.ArgumentParser() 13 parser = argparse.ArgumentParser()
14 parser.add_argument('--input_format', dest='input_format', help="Input dataset format") 14 parser.add_argument("--input_format", dest="input_format", help="Input dataset format")
15 parser.add_argument('--input', dest='input', help="Input dataset") 15 parser.add_argument("--input", dest="input", help="Input dataset")
16 parser.add_argument('--genome', dest='genome', help="Input dataset genome build") 16 parser.add_argument("--genome", dest="genome", help="Input dataset genome build")
17 parser.add_argument('--interpret_features', dest='interpret_features', default=None, help="Interpret features if input format is gff") 17 parser.add_argument(
18 parser.add_argument('--columns', dest='columns', help="Columns to use in input file") 18 "--interpret_features",
19 parser.add_argument('--reference_genome_source', dest='reference_genome_source', help="Source of reference genome file") 19 dest="interpret_features",
20 parser.add_argument('--reference_genome', dest='reference_genome', help="Reference genome file") 20 default=None,
21 parser.add_argument('--output_format', dest='output_format', help="Output format") 21 help="Interpret features if input format is gff",
22 parser.add_argument('--fasta_header_type', dest='fasta_header_type', default=None, help="Fasta header format") 22 )
23 parser.add_argument('--fasta_header_delimiter', dest='fasta_header_delimiter', default=None, help="Fasta header field delimiter") 23 parser.add_argument("--columns", dest="columns", help="Columns to use in input file")
24 parser.add_argument('--output', dest='output', help="Output dataset") 24 parser.add_argument(
25 "--reference_genome_source",
26 dest="reference_genome_source",
27 help="Source of reference genome file",
28 )
29 parser.add_argument(
30 "--reference_genome", dest="reference_genome", help="Reference genome file"
31 )
32 parser.add_argument("--output_format", dest="output_format", help="Output format")
33 parser.add_argument(
34 "--fasta_header_type",
35 dest="fasta_header_type",
36 default=None,
37 help="Fasta header format",
38 )
39 parser.add_argument(
40 "--fasta_header_delimiter",
41 dest="fasta_header_delimiter",
42 default=None,
43 help="Fasta header field delimiter",
44 )
45 parser.add_argument("--output", dest="output", help="Output dataset")
25 args = parser.parse_args() 46 args = parser.parse_args()
26 47
27 input_is_gff = args.input_format == 'gff' 48 input_is_gff = args.input_format == "gff"
28 interpret_features = input_is_gff and args.interpret_features == "yes" 49 interpret_features = input_is_gff and args.interpret_features == "yes"
29 if len(args.columns.split(',')) == 5: 50 if len(args.columns.split(",")) == 5:
30 # Bed file. 51 # Bed file.
31 chrom_col, start_col, end_col, strand_col, name_col = egdu.parse_cols_arg(args.columns) 52 chrom_col, start_col, end_col, strand_col, name_col = egdu.parse_cols_arg(
53 args.columns
54 )
32 else: 55 else:
33 # Gff file. 56 # Gff file.
34 chrom_col, start_col, end_col, strand_col = egdu.parse_cols_arg(args.columns) 57 chrom_col, start_col, end_col, strand_col = egdu.parse_cols_arg(args.columns)
35 name_col = False 58 name_col = False
36 59
45 nibs = {} 68 nibs = {}
46 skipped_lines = 0 69 skipped_lines = 0
47 first_invalid_line = 0 70 first_invalid_line = 0
48 invalid_lines = [] 71 invalid_lines = []
49 warnings = [] 72 warnings = []
50 warning = '' 73 warning = ""
51 twobitfile = None 74 twobitfile = None
52 line_count = 1 75 line_count = 1
53 file_iterator = open(args.input) 76 file_iterator = open(args.input)
54 if interpret_features: 77 if interpret_features:
55 file_iterator = egdu.GFFReaderWrapper(file_iterator, fix_strand=False) 78 file_iterator = egdu.GFFReaderWrapper(file_iterator, fix_strand=False)
56 out = open(args.output, 'wt') 79 out = open(args.output, "wt")
57 80
58 for feature in file_iterator: 81 for feature in file_iterator:
59 # Ignore comments, headers. 82 # Ignore comments, headers.
60 if isinstance(feature, (Header, Comment)): 83 if isinstance(feature, (Header, Comment)):
61 line_count += 1 84 line_count += 1
68 start = feature.start 91 start = feature.start
69 end = feature.end 92 end = feature.end
70 strand = feature.strand 93 strand = feature.strand
71 else: 94 else:
72 # Processing lines, either interval or GFF format. 95 # Processing lines, either interval or GFF format.
73 line = feature.rstrip('\r\n') 96 line = feature.rstrip("\r\n")
74 if line and not line.startswith("#"): 97 if line and not line.startswith("#"):
75 fields = line.split('\t') 98 fields = line.split("\t")
76 try: 99 try:
77 chrom = fields[chrom_col] 100 chrom = fields[chrom_col]
78 start = int(fields[start_col]) 101 start = int(fields[start_col])
79 end = int(fields[end_col]) 102 end = int(fields[end_col])
80 if name_col: 103 if name_col:
97 if not invalid_lines: 120 if not invalid_lines:
98 invalid_lines = egdu.get_lines(feature) 121 invalid_lines = egdu.get_lines(feature)
99 first_invalid_line = line_count 122 first_invalid_line = line_count
100 skipped_lines += len(invalid_lines) 123 skipped_lines += len(invalid_lines)
101 continue 124 continue
102 if strand not in ['+', '-']: 125 if strand not in ["+", "-"]:
103 strand = '+' 126 strand = "+"
104 sequence = '' 127 sequence = ""
105 else: 128 else:
106 continue 129 continue
107 # Open sequence file and get sequence for feature/interval. 130 # Open sequence file and get sequence for feature/interval.
108 if os.path.exists("%s/%s.nib" % (seq_dir, chrom)): 131 if os.path.exists("%s/%s.nib" % (seq_dir, chrom)):
109 if chrom in nibs: 132 if chrom in nibs:
110 nib = nibs[chrom] 133 nib = nibs[chrom]
111 else: 134 else:
112 nibs[chrom] = nib = bx.seq.nib.NibFile(open("%s/%s.nib" % (seq_path, chrom))) 135 nibs[chrom] = nib = bx.seq.nib.NibFile(
136 open("%s/%s.nib" % (seq_path, chrom))
137 )
113 try: 138 try:
114 sequence = nib.get(start, end - start) 139 sequence = nib.get(start, end - start)
115 except Exception as e: 140 except Exception:
116 warning = "Unable to fetch the sequence from '%d' to '%d' for build '%s'. " % (start, end - start, args.genome) 141 warning = (
142 "Unable to fetch the sequence from '%d' to '%d' for build '%s'. "
143 % (start, end - start, args.genome)
144 )
117 warnings.append(warning) 145 warnings.append(warning)
118 if not invalid_lines: 146 if not invalid_lines:
119 invalid_lines = egdu.get_lines(feature) 147 invalid_lines = egdu.get_lines(feature)
120 first_invalid_line = line_count 148 first_invalid_line = line_count
121 skipped_lines += len(invalid_lines) 149 skipped_lines += len(invalid_lines)
122 continue 150 continue
123 elif os.path.isfile(seq_path): 151 elif os.path.isfile(seq_path):
124 if not(twobitfile): 152 if not (twobitfile):
125 twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path)) 153 twobitfile = bx.seq.twobit.TwoBitFile(open(seq_path))
126 try: 154 try:
127 if interpret_features: 155 if interpret_features:
128 # Create sequence from intervals within a feature. 156 # Create sequence from intervals within a feature.
129 sequence = '' 157 sequence = ""
130 for interval in feature.intervals: 158 for interval in feature.intervals:
131 sequence += twobitfile[interval.chrom][interval.start:interval.end] 159 sequence += twobitfile[interval.chrom][
160 interval.start: interval.end
161 ]
132 else: 162 else:
133 sequence = twobitfile[chrom][start:end] 163 sequence = twobitfile[chrom][start:end]
134 except Exception: 164 except Exception:
135 warning = "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. " % (start, end - start, chrom) 165 warning = (
166 "Unable to fetch the sequence from '%d' to '%d' for chrom '%s'. "
167 % (start, end - start, chrom)
168 )
136 warnings.append(warning) 169 warnings.append(warning)
137 if not invalid_lines: 170 if not invalid_lines:
138 invalid_lines = egdu.get_lines(feature) 171 invalid_lines = egdu.get_lines(feature)
139 first_invalid_line = line_count 172 first_invalid_line = line_count
140 skipped_lines += len(invalid_lines) 173 skipped_lines += len(invalid_lines)
141 continue 174 continue
142 else: 175 else:
143 warning = "Chromosome by name '%s' was not found for build '%s'. " % (chrom, args.genome) 176 warning = "Chromosome by name '%s' was not found for build '%s'. " % (
177 chrom,
178 args.genome,
179 )
144 warnings.append(warning) 180 warnings.append(warning)
145 if not invalid_lines: 181 if not invalid_lines:
146 invalid_lines = egdu.get_lines(feature) 182 invalid_lines = egdu.get_lines(feature)
147 first_invalid_line = line_count 183 first_invalid_line = line_count
148 skipped_lines += len(invalid_lines) 184 skipped_lines += len(invalid_lines)
149 continue 185 continue
150 if sequence == '': 186 if sequence == "":
151 warning = "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. " % (chrom, start, end, args.genome) 187 warning = (
188 "Chrom: '%s', start: '%d', end: '%d' is either invalid or not present in build '%s'. "
189 % (chrom, start, end, args.genome)
190 )
152 warnings.append(warning) 191 warnings.append(warning)
153 if not invalid_lines: 192 if not invalid_lines:
154 invalid_lines = egdu.get_lines(feature) 193 invalid_lines = egdu.get_lines(feature)
155 first_invalid_line = line_count 194 first_invalid_line = line_count
156 skipped_lines += len(invalid_lines) 195 skipped_lines += len(invalid_lines)
159 sequence = egdu.reverse_complement(sequence) 198 sequence = egdu.reverse_complement(sequence)
160 if args.output_format == "fasta": 199 if args.output_format == "fasta":
161 if input_is_gff: 200 if input_is_gff:
162 start, end = egdu.convert_bed_coords_to_gff([start, end]) 201 start, end = egdu.convert_bed_coords_to_gff([start, end])
163 if args.fasta_header_type == "bedtools_getfasta_default": 202 if args.fasta_header_type == "bedtools_getfasta_default":
164 out.write(">%s\n" % egdu.get_bedtools_getfasta_default_header(str(chrom), 203 out.write(
165 str(start), 204 ">%s\n"
166 str(end), 205 % egdu.get_bedtools_getfasta_default_header(
167 strand, 206 str(chrom), str(start), str(end), strand, includes_strand_col
168 includes_strand_col)) 207 )
208 )
169 else: 209 else:
170 # args.fasta_header_type == "char_delimited": 210 # args.fasta_header_type == "char_delimited":
171 fields = [args.genome, str(chrom), str(start), str(end), strand] 211 fields = [args.genome, str(chrom), str(start), str(end), strand]
172 field_delimiter = egdu.get_fasta_header_delimiter(args.fasta_header_delimiter) 212 field_delimiter = egdu.get_fasta_header_delimiter(
213 args.fasta_header_delimiter
214 )
173 meta_data = field_delimiter.join(fields) 215 meta_data = field_delimiter.join(fields)
174 if name.strip(): 216 if name.strip():
175 out.write(">%s %s\n" % (meta_data, name)) 217 out.write(">%s %s\n" % (meta_data, name))
176 else: 218 else:
177 out.write(">%s\n" % meta_data) 219 out.write(">%s\n" % meta_data)
182 out.write("%s\n" % str(sequence[c:b])) 224 out.write("%s\n" % str(sequence[c:b]))
183 c = b 225 c = b
184 else: 226 else:
185 # output_format == "interval". 227 # output_format == "interval".
186 if interpret_features: 228 if interpret_features:
187 meta_data = "\t".join([feature.chrom, 229 meta_data = "\t".join(
188 "galaxy_extract_genomic_dna", 230 [
189 "interval", 231 feature.chrom,
190 str(feature.start), 232 "galaxy_extract_genomic_dna",
191 str(feature.end), 233 "interval",
192 feature.score, 234 str(feature.start),
193 feature.strand, 235 str(feature.end),
194 ".", 236 feature.score,
195 egdu.gff_attributes_to_str(feature.attributes, "GTF")]) 237 feature.strand,
238 ".",
239 egdu.gff_attributes_to_str(feature.attributes, "GTF"),
240 ]
241 )
196 else: 242 else:
197 # Here fields was set up around line 73. 243 # Here fields was set up around line 73.
198 meta_data = "\t".join(fields) 244 meta_data = "\t".join(fields)
199 if input_is_gff: 245 if input_is_gff:
200 format_str = "%s seq \"%s\";\n" 246 format_str = '%s seq "%s";\n'
201 else: 247 else:
202 format_str = "%s\t%s\n" 248 format_str = "%s\t%s\n"
203 out.write(format_str % (meta_data, str(sequence))) 249 out.write(format_str % (meta_data, str(sequence)))
204 # Update line count. 250 # Update line count.
205 if isinstance(feature, egdu.GFFFeature): 251 if isinstance(feature, egdu.GFFFeature):
212 warn_msg = "%d warnings, 1st is: " % len(warnings) 258 warn_msg = "%d warnings, 1st is: " % len(warnings)
213 warn_msg += warnings[0] 259 warn_msg += warnings[0]
214 print(warn_msg) 260 print(warn_msg)
215 if skipped_lines: 261 if skipped_lines:
216 # Error message includes up to the first 10 skipped lines. 262 # Error message includes up to the first 10 skipped lines.
217 print('Skipped %d invalid lines, 1st is #%d, "%s"' % (skipped_lines, first_invalid_line, '\n'.join(invalid_lines[:10]))) 263 print(
264 'Skipped %d invalid lines, 1st is #%d, "%s"'
265 % (skipped_lines, first_invalid_line, "\n".join(invalid_lines[:10]))
266 )
218 267
219 if args.reference_genome_source == "history": 268 if args.reference_genome_source == "history":
220 os.remove(seq_path) 269 os.remove(seq_path)