comparison remove_fake_cut_sites.py @ 4:8cc3862f8b8e draft

"planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
author bgruening
date Tue, 25 May 2021 20:12:52 +0000
parents
children
comparison
equal deleted inserted replaced
3:295c0e28f4ee 4:8cc3862f8b8e
1 import re
2 import sys
3
4 from Bio import SeqIO
5 from Bio.Seq import Seq
6
7
8 def main():
9
10 fasta_file = sys.argv[1]
11 output_file = sys.argv[2]
12 log_file = sys.argv[3]
13
14 output_handle = open(output_file, "w")
15 log_handle = open(log_file, "w")
16
17 with open(fasta_file, "r") as fasta_input_handle:
18 for record in SeqIO.parse(fasta_input_handle, "fasta"):
19
20 change_count = 0
21 cut_sites = [
22 Seq("CTTAAG"),
23 Seq("CTTCTCG"),
24 Seq("GCTCTTC"),
25 Seq("CCTCAGC"),
26 Seq("GAATGC"),
27 Seq("GCAATG"),
28 Seq("ATCGAT"),
29 Seq("CACGAG"),
30 ]
31
32 for cut_site in cut_sites:
33 cut_site_both_orientations = (cut_site, cut_site.reverse_complement())
34
35 for cut_site_for_orientation in cut_site_both_orientations:
36
37 n_flank_length = 1
38 search_pattern = (
39 "N" * n_flank_length
40 + str(cut_site_for_orientation)
41 + "N" * n_flank_length
42 )
43 replacement = "N" * (
44 n_flank_length * 2 + len(cut_site_for_orientation)
45 )
46
47 (new_string, changes) = re.subn(
48 search_pattern,
49 replacement,
50 str(record.seq.upper()),
51 flags=re.IGNORECASE,
52 )
53 change_count += changes
54
55 record.seq = Seq(new_string)
56
57 if change_count > 0:
58 log_handle.write(
59 " ".join([record.id, ":", str(change_count), "changes\n"])
60 )
61 SeqIO.write([record], output_handle, "fasta")
62
63 # Finally, count the matches
64 possible_fake_cut_sites = re.findall(
65 "N[^N]{1,10}N", str(record.seq.upper())
66 )
67 if len(possible_fake_cut_sites) > 0:
68 log_handle.write(
69 " ".join(
70 [
71 record.id,
72 ":",
73 str(len(possible_fake_cut_sites)),
74 "possible non-standard fake cut sites\n",
75 ]
76 )
77 )
78
79 output_handle.close()
80 log_handle.close()
81
82
83 if __name__ == "__main__":
84 main()