annotate remove_fake_cut_sites.py @ 10:22286dab5aa1 draft

planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/bionano commit bf9f16e7144d2a619863c84a4aede8c9ef520d0e
author bgruening
date Wed, 18 Jan 2023 15:22:13 +0000
parents 8cc3862f8b8e
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
4
8cc3862f8b8e "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
1 import re
8cc3862f8b8e "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
2 import sys
8cc3862f8b8e "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
3
8cc3862f8b8e "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
4 from Bio import SeqIO
8cc3862f8b8e "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
5 from Bio.Seq import Seq
8cc3862f8b8e "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
6
8cc3862f8b8e "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
7
8cc3862f8b8e "planemo upload for repository https://bionanogenomics.com/support/software-downloads/ commit a3d75aba3a21d88adb3706fbcefcaed4fbcb80fe"
bgruening
parents:
diff changeset
def _compile_cut_site_patterns(cut_sites, n_flank_length=1):
    """Build the (compiled regex, replacement) pairs used to mask cut sites.

    For every entry of *cut_sites* and for its reverse complement, a pattern
    is built that matches the site when it is flanked on both sides by
    ``n_flank_length`` ``N`` characters.

    The trailing flank is matched with a lookahead instead of being consumed.
    A consumed trailing ``N`` cannot serve as the leading flank of the next
    match, so two sites separated by a single ``N`` would otherwise leave the
    second one unmasked. Because the trailing flank is already ``N``, only
    the leading flank and the site itself need to be rewritten.

    Args:
        cut_sites: iterable of objects that support ``str()`` and
            ``reverse_complement()`` (``Bio.Seq.Seq`` instances here).
        n_flank_length: number of ``N`` characters required on each side.

    Returns:
        A list of ``(pattern, replacement)`` tuples in first-seen order.
        Palindromic sites, whose reverse complement equals the site itself,
        contribute a single entry.
    """
    flank = "N" * n_flank_length
    seen = set()
    compiled = []
    for cut_site in cut_sites:
        for oriented_site in (cut_site, cut_site.reverse_complement()):
            site = str(oriented_site)
            if site in seen:
                continue
            seen.add(site)
            pattern = re.compile(
                flank + re.escape(site) + "(?=" + flank + ")",
                flags=re.IGNORECASE,
            )
            replacement = "N" * (n_flank_length + len(site))
            compiled.append((pattern, replacement))
    return compiled


def main():
    """Mask N-flanked cut-site sequences in a FASTA file.

    Command-line arguments (``sys.argv``):
        1. path of the input FASTA file,
        2. path of the FASTA file to write,
        3. path of the log file to write.

    Every record is upper-cased, each hard-coded cut site (in both
    orientations) that is directly flanked by ``N`` on both sides is
    overwritten with ``N``, and the record is written to the output file.
    The log receives one line per record with the number of replacements
    (only if non-zero) and one line with the number of remaining short
    non-``N`` islands (1-10 characters between two ``N``), if any.

    Exits with a usage message when fewer than three arguments are given.
    """
    if len(sys.argv) < 4:
        sys.exit(
            "usage: remove_fake_cut_sites.py "
            "<input.fasta> <output.fasta> <log_file>"
        )

    fasta_file = sys.argv[1]
    output_file = sys.argv[2]
    log_file = sys.argv[3]

    # Recognition sequences to mask. NOTE(review): presumably the sites of
    # the labelling enzymes supported by the upstream tool - confirm.
    cut_sites = [
        Seq("CTTAAG"),
        Seq("CTTCTCG"),
        Seq("GCTCTTC"),
        Seq("CCTCAGC"),
        Seq("GAATGC"),
        Seq("GCAATG"),
        Seq("ATCGAT"),
        Seq("CACGAG"),
    ]
    # Loop-invariant: build the patterns once instead of once per record.
    cut_site_patterns = _compile_cut_site_patterns(cut_sites, n_flank_length=1)

    # Short non-N islands between Ns. The closing N is a lookahead so that
    # islands sharing a flanking N are each counted.
    island_pattern = re.compile(r"N[^N]{1,10}(?=N)")

    # Context managers guarantee that all three files are flushed and closed
    # even when parsing or writing raises.
    with open(fasta_file, "r") as fasta_input_handle:
        with open(output_file, "w") as output_handle, open(
            log_file, "w"
        ) as log_handle:
            for record in SeqIO.parse(fasta_input_handle, "fasta"):
                sequence = str(record.seq.upper())
                change_count = 0

                # Apply the patterns cumulatively to the same string.
                for pattern, replacement in cut_site_patterns:
                    sequence, changes = pattern.subn(replacement, sequence)
                    change_count += changes

                record.seq = Seq(sequence)

                if change_count > 0:
                    log_handle.write(
                        " ".join(
                            [record.id, ":", str(change_count), "changes\n"]
                        )
                    )
                SeqIO.write([record], output_handle, "fasta")

                # Finally, count what still looks like an inserted site.
                possible_fake_cut_sites = island_pattern.findall(sequence)
                if possible_fake_cut_sites:
                    log_handle.write(
                        " ".join(
                            [
                                record.id,
                                ":",
                                str(len(possible_fake_cut_sites)),
                                "possible non-standard fake cut sites\n",
                            ]
                        )
                    )


if __name__ == "__main__":
    main()