comparison file_operations.py @ 1:112751823323 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c
author cpt
date Mon, 05 Jun 2023 02:52:57 +0000
parents
children
comparison
equal deleted inserted replaced
0:9f62910edcc9 1:112751823323
def fasta_from_SAR_dict(sar_dict, fa_file):
    """Write every SAR candidate to ``fa_file`` as a multi-FASTA record.

    Args:
        sar_dict: Mapping whose values are dicts providing a
            ``"description"`` entry (used as the ``>`` header line) and a
            ``"sequence"`` entry (written on the following line).
        fa_file: Writable text handle.  It is used as a context manager, so
            its ``__exit__`` runs before this function returns (an ordinary
            file object is closed at that point).
    """
    with fa_file as handle:
        for record in sar_dict.values():
            # Header is written before the sequence is looked up, so a record
            # without a sequence still leaves its header in the output.
            handle.write(">{}\n".format(record["description"]))
            handle.write("{}\n".format(record["sequence"]))
7
8
def gff3_from_SAR_dict(sar_dict, gff3_file):
    """Write the biggest SAR hit of every candidate as GFF3-style rows.

    A tab-separated row of column titles is written first.  Then, for each
    candidate, the ``##gff-version 3`` and ``##sequence-region <name>``
    pragmas are emitted, followed by three feature rows built from the first
    match stored under ``"TMD_<biggest_sar>"``: the N-terminal topological
    domain, the SAR domain and the C-terminal topological domain.  When
    ``sar_dict`` is empty only the title row and the two bare pragmas are
    written.

    NOTE(review): the title row precedes ``##gff-version 3`` and the pragma
    is repeated per candidate; strict GFF3 parsers may object -- confirm
    with downstream consumers before changing the layout.

    Args:
        sar_dict: Mapping of sequence name to candidate data.  Each value
            must provide ``"biggest_sar"`` and a matching ``"TMD_<n>"`` list
            whose first element is indexable as: ``[2]`` N-terminal net
            charge, ``[3]`` iterable of 2-item entries whose second items
            are summed for the total percentage, ``[4]``/``[5]``/``[6]``
            ``"start..end"`` strings for the N-terminal, SAR and C-terminal
            regions.
        gff3_file: Writable text handle, used as a context manager.
    """
    column_titles = (
        "Seqid",
        "Source",
        "Type",
        "Start",
        "End",
        "Score",
        "Strand",
        "Phase",
        "Attributes",
    )
    with gff3_file as out:
        out.write("\t".join(column_titles) + "\n")

        if not sar_dict:
            out.write("##gff-version 3\n")
            out.write("##sequence-region\n")
            return

        first_hit = 0  # only the first stored match of the biggest window is used
        for seq_id, entry in sar_dict.items():
            out.write("##gff-version 3\n")
            out.write(f"##sequence-region {seq_id}\n")

            hit = entry["TMD_" + str(entry["biggest_sar"])][first_hit]
            # split_seq_string shifts the stored coordinates by +1.
            n_start, n_end = split_seq_string(hit[4])
            sar_start, sar_end = split_seq_string(hit[5])
            c_start, c_end = split_seq_string(hit[6])

            net_charge = hit[2]
            out.write(
                f"{seq_id}\tSAR_finder\tTopological domain\t{n_start}\t{n_end}"
                f"\t.\t.\t.\tNote=N-terminal net charge is {net_charge}\n"
            )

            residue_percents = list(hit[3])
            total_percent = round(sum(pct for _, pct in hit[3]), 2)
            out.write(
                f"{seq_id}\tSAR_finder\tSAR domain\t{sar_start}\t{sar_end}"
                f"\t.\t.\t.\tNote=residue % in SAR {residue_percents}"
                f",Total % is {total_percent}\n"
            )

            out.write(
                f"{seq_id}\tSAR_finder\tTopological domain\t{c_start}\t{c_end}"
                f"\t.\t.\t.\tNote=C-terminus\n"
            )
53
54
def tab_from_SAR_dict(sar_dict, stat_file, hydrophillic_res, sar_min, sar_max):
    """Write one tab-separated row per SAR match, widest window first.

    The first line is a title row; its eighth cell is the printed Python
    list of ``"<res>%"`` labels built from ``hydrophillic_res``.  For every
    candidate, window sizes are visited from ``sar_max`` down to ``sar_min``
    and each match stored under ``"TMD_<size>"`` (other than the ``[""]``
    placeholder) becomes one row.  Start and end positions are the stored
    values at indexes 7 and 8 plus one.  A match that is too short to index
    produces a row of ``ERROR`` cells instead of aborting the run.

    Args:
        sar_dict: Mapping of sequence name to candidate data providing
            ``"sequence"``, ``"size"`` and optional ``"TMD_<size>"`` lists.
        stat_file: Writable text handle, used as a context manager.
        hydrophillic_res: Residue labels used only for the title row.
        sar_min: Smallest window size to report (inclusive).
        sar_max: Largest window size to report (inclusive).
    """
    header_fields = [
        "Name",
        "Protein Sequence",
        "Protein Length",
        "SAR Length",
        "SAR Start",
        "Putative SAR Sequence",
        "SAR End",
        [f"{res}%" for res in hydrophillic_res],
        "% Total",
        "N-term Sequence",
        "N-term net Charge",
    ]
    error_row = (
        "ERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\tERROR\n"
    )

    with stat_file as table:
        table.write("\t".join(map(format, header_fields)) + "\n")
        if not sar_dict:
            return

        for seq_name, entry in sar_dict.items():
            for window in range(sar_max, sar_min - 1, -1):
                window_key = "TMD_" + str(window)
                if window_key not in entry:
                    continue
                for hit in entry[window_key]:
                    if hit == [""]:
                        # placeholder for "no match at this window size"
                        continue
                    try:
                        row_fields = [
                            seq_name,
                            entry["sequence"],
                            entry["size"],
                            window,
                            int(hit[7]) + 1,
                            hit[0],
                            int(hit[8]) + 1,
                            list(hit[3]),
                            round(sum(pct for _, pct in hit[3]), 2),
                            hit[1],
                            hit[2],
                        ]
                        table.write("\t".join(map(format, row_fields)) + "\n")
                    except IndexError:
                        # Incomplete match record: keep the table aligned
                        # rather than failing the whole export.
                        table.write(error_row)
95
96
def stat_file_from_SAR_dict(sar_dict, stat_file, sar_min, sar_max):
    """Write a human-readable report of every SAR candidate.

    After a banner line, each candidate gets its description, sequence and
    length, then one section per window size present in its data (visited
    from ``sar_max`` down to ``sar_min``) listing every stored match other
    than the ``[""]`` placeholder, and finally a separator line.  When
    ``sar_dict`` is empty the banner is followed by a single "not found"
    message.

    Args:
        sar_dict: Mapping whose values provide ``"description"``,
            ``"sequence"``, ``"size"`` and optional ``"TMD_<size>"`` lists.
            Each match is indexed 0-7; index 3 is iterated as 2-item entries
            (label, percent).
        stat_file: Writable text handle, used as a context manager.
        sar_min: Smallest window size to report (inclusive).
        sar_max: Largest window size to report (inclusive).
    """
    # (template, key) pairs written once per candidate, in this order.
    protein_lines = (
        ("Protein Description and Name: {}\n", "description"),
        ("Protein Sequence: {}\n", "sequence"),
        ("Protein Length: {}\n", "size"),
    )
    # (template, match index) pairs written before the per-residue lines...
    leading_lines = (
        ("\nPotential SAR domain sequence: {}\n", 0),
        ("N-term sequence: {}\n", 1),
        ("N-term net charge: {}\n", 2),
    )
    # ...and after them.
    trailing_lines = (
        ("N-term coords: {}\n", 4),
        ("SAR coords: {}\n", 5),
        ("C-term coords: {}\n", 6),
        ("SAR start: {}\n", 7),
    )

    with stat_file as report:
        report.write("..........:::::: Candidate SAR Proteins ::::::..........\n\n")
        if not sar_dict:
            report.write("No candidate SAR Proteins found")
            return

        for entry in sar_dict.values():
            for template, key in protein_lines:
                report.write(template.format(entry[key]))
            report.write("SAR Criteria matching region(s)\n")

            for window in range(sar_max, sar_min - 1, -1):
                window_key = "TMD_" + str(window)
                if window_key not in entry:
                    continue
                report.write("\nSAR length of {}:\n".format(window))
                for hit in entry[window_key]:
                    if hit == [""]:
                        # placeholder for "no match at this window size"
                        continue
                    for template, position in leading_lines:
                        report.write(template.format(hit[position]))
                    for residue_stat in hit[3]:
                        report.write(
                            "Percent {} content: {}%\n".format(
                                residue_stat[0], residue_stat[1]
                            )
                        )
                    for template, position in trailing_lines:
                        report.write(template.format(hit[position]))

            # One separator closes each candidate's section.
            report.write(
                "========================================================\n\n"
            )
146
147
def split_seq_string(input_range, python_indexing=True):
    """Split a ``"<start>..<end>"`` string into its two bounds.

    Args:
        input_range: Text holding two values separated by ``".."``.
        python_indexing: When true (the default) both bounds are converted
            to ``int`` and incremented by one; when false the two pieces are
            returned untouched, as strings.

    Returns:
        A ``(start, end)`` tuple -- ints when ``python_indexing`` is true,
        otherwise the raw substrings.
    """
    pieces = input_range.split("..")
    if not python_indexing:
        return pieces[0], pieces[1]
    return int(pieces[0]) + 1, int(pieces[1]) + 1
160
161
# Running this file directly does nothing: the guard body is a bare ``pass``.
if __name__ == "__main__":
    pass