cpt_sar_finder: comparison of file_operations.py @ 1:112751823323 (draft)
planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author: cpt
date: Mon, 05 Jun 2023 02:52:57 +0000
parents:
children:
comparison: 0:9f62910edcc9 → 1:112751823323
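# Note on the expected input (added for clarity, inferred from how this module
# reads sar_dict rather than from upstream documentation): sar_dict maps a
# protein name to a dict with "description", "sequence", "size", and
# "biggest_sar" keys plus one "TMD_<length>" list per candidate SAR length.
# Each entry in a "TMD_<length>" list appears to be indexed as:
#   [0] putative SAR sequence        [1] N-term sequence
#   [2] N-term net charge            [3] list of (residue, percent) pairs
#   [4] N-term coords "start..end"   [5] SAR coords "start..end"
#   [6] C-term coords "start..end"   [7] SAR start index
#   [8] SAR end index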
def fasta_from_SAR_dict(sar_dict, fa_file):
    """Write a multi-FASTA file of candidate proteins from the SAR dictionary."""
    with fa_file as f:
        for data in sar_dict.values():
            f.writelines(">{}\n".format(data["description"]))
            f.writelines("{}\n".format(data["sequence"]))
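# Illustrative usage sketch (not part of the original tool; the dict contents
# and file name are invented examples). The function expects an already-open,
# writable handle, which it closes via its own "with" block:
#   candidates = {"prot_A": {"description": "prot_A hypothetical SAR candidate",
#                            "sequence": "MAGFLKVVQLLAKYGS"}}
#   fasta_from_SAR_dict(candidates, open("sar_candidates.fa", "w"))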
def gff3_from_SAR_dict(sar_dict, gff3_file):
    """Write a multi-record GFF3 file of candidates from the SAR dictionary."""
    gff3_cols = [
        "Seqid",
        "Source",
        "Type",
        "Start",
        "End",
        "Score",
        "Strand",
        "Phase",
        "Attributes",
    ]
    with gff3_file as f:
        f.writelines(
            f"{gff3_cols[0]}\t{gff3_cols[1]}\t{gff3_cols[2]}\t{gff3_cols[3]}\t{gff3_cols[4]}\t{gff3_cols[5]}\t{gff3_cols[6]}\t{gff3_cols[7]}\t{gff3_cols[8]}\n"
        )
        if sar_dict:
            # print(sar_dict)
            for name, data in sar_dict.items():
                # report only the first match at the largest SAR length found
                min_idx = 0
                f.writelines("##gff-version 3\n")
                f.writelines(f"##sequence-region {name}\n")
                # indices 4-6 of a match hold "start..end" coordinate strings
                # for the N-terminal, SAR, and C-terminal regions, respectively
                n_start, n_end = split_seq_string(
                    data["TMD_" + str(data["biggest_sar"])][min_idx][4]
                )
                sar_start, sar_end = split_seq_string(
                    data["TMD_" + str(data["biggest_sar"])][min_idx][5]
                )
                c_start, c_end = split_seq_string(
                    data["TMD_" + str(data["biggest_sar"])][min_idx][6]
                )
                f.writelines(
                    f'{name}\tSAR_finder\tTopological domain\t{n_start}\t{n_end}\t.\t.\t.\tNote=N-terminal net charge is {data["TMD_"+str(data["biggest_sar"])][min_idx][2]}\n'
                )
                f.writelines(
                    f'{name}\tSAR_finder\tSAR domain\t{sar_start}\t{sar_end}\t.\t.\t.\tNote=residue % in SAR {[perc for perc in data["TMD_"+str(data["biggest_sar"])][min_idx][3]]},Total % is {round(sum(j for i,j in data["TMD_"+str(data["biggest_sar"])][min_idx][3]),2)}\n'
                )
                f.writelines(
                    f"{name}\tSAR_finder\tTopological domain\t{c_start}\t{c_end}\t.\t.\t.\tNote=C-terminus\n"
                )
        else:
            f.writelines("##gff-version 3\n")
            f.writelines("##sequence-region\n")
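# For orientation only (values invented for illustration, real output is
# tab-separated), a candidate named "prot_A" would yield lines shaped roughly like:
#   prot_A  SAR_finder  Topological domain  1   25   .  .  .  Note=N-terminal net charge is 2
#   prot_A  SAR_finder  SAR domain          26  45   .  .  .  Note=residue % in SAR [...],Total % is 60.0
#   prot_A  SAR_finder  Topological domain  46  120  .  .  .  Note=C-terminus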
def tab_from_SAR_dict(sar_dict, stat_file, hydrophillic_res, sar_min, sar_max):
    """Write SAR dictionary matches to a tab-separated statistics table."""
    columns = [
        "Name",
        "Protein Sequence",
        "Protein Length",
        "SAR Length",
        "SAR Start",
        "Putative SAR Sequence",
        "SAR End",
        [f"{res}%" for res in hydrophillic_res],
        "% Total",
        "N-term Sequence",
        "N-term net Charge",
    ]  # using different residues for percent calc: [f"{res}%" for res in hydrophillic_res]
    with stat_file as f:
        f.writelines(
            f"{columns[0]}\t{columns[1]}\t{columns[2]}\t{columns[3]}\t{columns[4]}\t{columns[5]}\t{columns[6]}\t{columns[7]}\t{columns[8]}\t{columns[9]}\t{columns[10]}\n"
        )
        if sar_dict:
            # print(sar_dict)
            for name, data in sar_dict.items():
                # walk candidate SAR lengths from longest to shortest
                for tmd_size in range(sar_max, sar_min - 1, -1):
                    if "TMD_" + str(tmd_size) in data:
                        for each_match in data["TMD_" + str(tmd_size)]:
                            if each_match != [""]:
                                # print(f"{name} - {data}")
                                # print(each_match)
                                # for perc in each_match[3]:
                                #     print(perc)
                                try:
                                    f.writelines(
                                        f'{name}\t{data["sequence"]}\t{data["size"]}\t{tmd_size}\t{int(each_match[7])+1}\t{each_match[0]}\t{int(each_match[8])+1}\t{[perc for perc in each_match[3]]}\t{round(sum(j for i,j in each_match[3]),2)}\t{each_match[1]}\t{each_match[2]}\n'
                                    )
                                except IndexError:
                                    f.writelines(
                                        "ERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\n"
                                    )
                    else:
                        continue
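# Reading note (added commentary, not from the original source): one row is
# emitted per match at every SAR length from sar_max down to sar_min, so a single
# protein can appear many times; the residue-percentage column is written as a
# Python list literal, e.g. something like [('S', 30.0), ('G', 20.0)]
# (values invented for illustration).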
def stat_file_from_SAR_dict(sar_dict, stat_file, sar_min, sar_max):
    """Write a human-readable summary of candidate SAR proteins."""
    with stat_file as f:
        f.writelines("..........:::::: Candidate SAR Proteins ::::::..........\n\n")
        if sar_dict:
            for data in sar_dict.values():
                f.writelines(
                    "Protein Description and Name: {}\n".format(data["description"])
                )
                f.writelines("Protein Sequence: {}\n".format(data["sequence"]))
                f.writelines("Protein Length: {}\n".format(data["size"]))
                f.writelines("SAR Criteria matching region(s)\n")
                for tmd_size in range(sar_max, sar_min - 1, -1):
                    if "TMD_" + str(tmd_size) in data:
                        f.writelines("\nSAR length of {}:\n".format(tmd_size))
                        for each_match in data["TMD_" + str(tmd_size)]:
                            if each_match != [""]:
                                f.writelines(
                                    "\nPotential SAR domain sequence: {}\n".format(
                                        each_match[0]
                                    )
                                )
                                f.writelines(
                                    "N-term sequence: {}\n".format(each_match[1])
                                )
                                f.writelines(
                                    "N-term net charge: {}\n".format(each_match[2])
                                )
                                for each_perc_calc in each_match[3]:
                                    f.writelines(
                                        "Percent {} content: {}%\n".format(
                                            each_perc_calc[0], each_perc_calc[1]
                                        )
                                    )
                                f.writelines(
                                    "N-term coords: {}\n".format(each_match[4])
                                )
                                f.writelines("SAR coords: {}\n".format(each_match[5]))
                                f.writelines(
                                    "C-term coords: {}\n".format(each_match[6])
                                )
                                f.writelines("SAR start: {}\n".format(each_match[7]))
                            else:
                                continue
                f.writelines(
                    "========================================================\n\n"
                )
        else:
            f.writelines("No candidate SAR Proteins found")
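# Added note (not in the original source): unlike tab_from_SAR_dict above, this
# report prints each_match[7] as-is, so the "SAR start" value here appears to be
# the raw 0-based index rather than the 1-based position written to the tab file.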
def split_seq_string(input_range, python_indexing=True):
    """Split a "start..end" coordinate string into its start and end values.

    With python_indexing=True the 0-based coordinates are converted to 1-based
    integers (each value is incremented by 1); otherwise the raw substrings are
    returned unchanged.
    """
    if python_indexing:
        values = input_range.split("..")
        start = int(values[0]) + 1
        end = int(values[1]) + 1
    else:
        values = input_range.split("..")
        start = values[0]
        end = values[1]

    return start, end
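# Worked example (added for clarity): split_seq_string("0..24") returns (1, 25),
# while split_seq_string("0..24", python_indexing=False) returns the strings
# ("0", "24").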
if __name__ == "__main__":
    pass
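    # Minimal smoke-test sketch (added; everything below is an invented example,
    # not part of the tool's command-line entry point). It exercises the helpers
    # that need nothing beyond "description"/"sequence" keys and a coordinate
    # string.
    demo_dict = {
        "prot_A": {
            "description": "prot_A hypothetical SAR candidate",
            "sequence": "MAGFLKVVQLLAKYGS",
        }
    }
    fasta_from_SAR_dict(demo_dict, open("demo_candidates.fa", "w"))
    print(split_seq_string("0..15"))  # -> (1, 16)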