Mercurial > repos > petr-novak > profrep
comparison profrep_masking.py @ 0:a5f1638b73be draft
Uploaded
author | petr-novak |
---|---|
date | Wed, 26 Jun 2019 08:01:42 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a5f1638b73be |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 import argparse | |
4 from Bio import SeqIO | |
5 from Bio.Seq import MutableSeq | |
6 from Bio.Alphabet import generic_dna | |
7 import sys | |
8 | |
9 | |
10 def main(args): | |
11 # Command line arguments | |
12 QUERY = args.query | |
13 MODE = args.mode | |
14 REPEAT_GFF = args.repeat_gff | |
15 MASKED = args.output_masked | |
16 | |
17 repeats_all = get_indices(REPEAT_GFF) | |
18 | |
19 if MODE == "lowercase": | |
20 lower_mask(QUERY, repeats_all, MASKED) | |
21 else: | |
22 N_mask(QUERY, repeats_all, MASKED) | |
23 | |
24 | |
25 def get_indices(REPEAT_GFF): | |
26 ''' | |
27 Get indices of repeats from GFF file to mask | |
28 ''' | |
29 repeats_all = {} | |
30 with open(REPEAT_GFF, "r") as repeats_gff: | |
31 for line in repeats_gff: | |
32 if not line.startswith("#"): | |
33 seq_id = line.split("\t")[0] | |
34 start_r = line.split("\t")[3] | |
35 end_r = line.split("\t")[4] | |
36 if seq_id in repeats_all.keys(): | |
37 repeats_all[seq_id].append([int(start_r), int(end_r)]) | |
38 else: | |
39 repeats_all[seq_id] = [[int(start_r), int(end_r)]] | |
40 return repeats_all | |
41 | |
42 | |
43 def lower_mask(QUERY, repeats_all, MASKED): | |
44 allSeqs = list(SeqIO.parse(QUERY, 'fasta')) | |
45 for singleSeq in allSeqs: | |
46 mutable = MutableSeq(str(singleSeq.seq), generic_dna) | |
47 for index in repeats_all[singleSeq.id]: | |
48 for item in range(index[0] - 1, index[1]): | |
49 mutable[item] = mutable[item].lower() | |
50 singleSeq.seq = mutable | |
51 with open(MASKED, "w") as handle: | |
52 SeqIO.write(allSeqs, handle, 'fasta') | |
53 | |
54 | |
55 def N_mask(QUERY, repeats_all, MASKED): | |
56 allSeqs = list(SeqIO.parse(QUERY, 'fasta')) | |
57 for singleSeq in allSeqs: | |
58 mutable = MutableSeq(str(singleSeq.seq), generic_dna) | |
59 for index in repeats_all[singleSeq.id]: | |
60 for item in range(index[0] - 1, index[1]): | |
61 mutable[item] = "N" | |
62 singleSeq.seq = mutable | |
63 with open(MASKED, "w") as handle: | |
64 SeqIO.write(allSeqs, handle, 'fasta') | |
65 | |
66 | |
67 if __name__ == "__main__": | |
68 | |
69 # Command line arguments | |
70 parser = argparse.ArgumentParser() | |
71 parser.add_argument('-q', | |
72 '--query', | |
73 type=str, | |
74 required=True, | |
75 help='query sequence to be processed') | |
76 parser.add_argument('-rg', | |
77 '--repeat_gff', | |
78 type=str, | |
79 required=True, | |
80 help='query sequence to be processed') | |
81 parser.add_argument('-m', | |
82 '--mode', | |
83 default="lowercase", | |
84 choices=['lowercase', 'N'], | |
85 help='query sequence to be processed') | |
86 parser.add_argument('-o', | |
87 '--output_masked', | |
88 type=str, | |
89 default="output_masked", | |
90 help='query sequence to be processed') | |
91 | |
92 args = parser.parse_args() | |
93 main(args) |