cpt_sar_finder: file_operations.py comparison

comparison file_operations.py @ 1:112751823323 draft

planemo upload commit 94b0cd1fff0826c6db3e7dc0c91c0c5a8be8bb0c

author	cpt
date	Mon, 05 Jun 2023 02:52:57 +0000
parents
children

comparison

equal deleted inserted replaced

-:9f62910edcc9
+:112751823323
def fasta_from_SAR_dict(sar_dict, fa_file):
    """Write every candidate in the SAR dictionary to a multi-FASTA file.

    Args:
        sar_dict: Mapping whose values are dicts providing at least the
            ``"description"`` and ``"sequence"`` keys. The mapping's keys
            are not used here.
        fa_file: Writable text file object that is also a context manager.
            It is entered with ``with``, so whatever its ``__exit__`` does
            (closing, for a regular file handle) happens before returning.
    """
    with fa_file as f:
        for data in sar_dict.values():
            # ``write`` rather than ``writelines``: each record line is a
            # single str, and ``writelines`` would iterate it per character.
            f.write(f">{data['description']}\n")
            f.write(f"{data['sequence']}\n")
def gff3_from_SAR_dict(sar_dict, gff3_file):
    """Write the SAR candidates as GFF3-style feature lines.

    For each entry, three features are written (N-terminal topological
    domain, SAR domain, C-terminal topological domain), taken from the first
    match stored under ``"TMD_<biggest_sar>"``.

    Args:
        sar_dict: Mapping of sequence name -> candidate dict. Each candidate
            must provide ``"biggest_sar"`` and a ``"TMD_<biggest_sar>"`` list
            whose first element is indexable with: ``[2]`` N-terminal net
            charge, ``[3]`` iterable of 2-item (residue, percent) pairs,
            ``[4]``/``[5]``/``[6]`` ``"start..end"`` coordinate strings for
            the N-terminal, SAR and C-terminal regions.
        gff3_file: Writable text file object that is also a context manager;
            it is entered with ``with`` here.

    NOTE(review): the emitted bytes are kept identical to the previous
    implementation, but the layout deviates from the GFF3 specification: a
    bare column-name row precedes the ``##gff-version 3`` pragma, the pragma
    is repeated for every record, and ``##sequence-region`` carries no
    start/end. Confirm with downstream consumers before tightening this.
    """
    gff3_cols = [
        "Seqid",
        "Source",
        "Type",
        "Start",
        "End",
        "Score",
        "Strand",
        "Phase",
        "Attributes",
    ]
    with gff3_file as f:
        f.write("\t".join(gff3_cols) + "\n")

        if not sar_dict:
            # No candidates: still emit the pragma lines.
            f.write("##gff-version 3\n")
            f.write("##sequence-region\n")
            return

        for name, data in sar_dict.items():
            f.write("##gff-version 3\n")
            f.write(f"##sequence-region {name}\n")

            # Only the first match of the largest SAR size is reported.
            best = data["TMD_" + str(data["biggest_sar"])][0]
            n_start, n_end = split_seq_string(best[4])
            sar_start, sar_end = split_seq_string(best[5])
            c_start, c_end = split_seq_string(best[6])

            # Rendered with ``str`` of a list, e.g. "[('K', 20.0)]".
            residue_percents = list(best[3])
            total_percent = round(sum(pct for _, pct in residue_percents), 2)

            f.write(
                f"{name}\tSAR_finder\tTopological domain\t{n_start}\t{n_end}"
                f"\t.\t.\t.\tNote=N-terminal net charge is {best[2]}\n"
            )
            f.write(
                f"{name}\tSAR_finder\tSAR domain\t{sar_start}\t{sar_end}"
                f"\t.\t.\t.\tNote=residue % in SAR {residue_percents}"
                f",Total % is {total_percent}\n"
            )
            f.write(
                f"{name}\tSAR_finder\tTopological domain\t{c_start}\t{c_end}"
                f"\t.\t.\t.\tNote=C-terminus\n"
            )
def tab_from_SAR_dict(sar_dict, stat_file, hydrophillic_res, sar_min, sar_max):
    """Write a tab-separated table with one row per SAR match.

    A header row is always written. Matches are visited per sequence, from
    ``sar_max`` down to ``sar_min`` (inclusive); the placeholder match
    ``[""]`` is skipped.

    Args:
        sar_dict: Mapping of sequence name -> candidate dict providing
            ``"sequence"``, ``"size"`` and optional ``"TMD_<n>"`` lists of
            matches. A match is indexed as: ``[0]`` putative SAR sequence,
            ``[1]`` N-terminal sequence, ``[2]`` N-terminal net charge,
            ``[3]`` iterable of 2-item (residue, percent) pairs, ``[7]`` SAR
            start, ``[8]`` SAR end.
        stat_file: Writable text file object that is also a context manager;
            it is entered with ``with`` here.
        hydrophillic_res: Residue labels used to build the percent column
            header.
        sar_min: Smallest SAR length to report (inclusive).
        sar_max: Largest SAR length to report (inclusive).
    """
    columns = [
        "Name",
        "Protein Sequence",
        "Protein Length",
        "SAR Length",
        "SAR Start",
        "Putative SAR Sequence",
        "SAR End",
        # Deliberately a nested list: it is rendered with ``str`` so the
        # header cell reads e.g. "['K%', 'R%']", mirroring the data rows.
        [f"{res}%" for res in hydrophillic_res],
        "% Total",
        "N-term Sequence",
        "N-term net Charge",
    ]
    error_row = "\t".join(["ERROR"] * len(columns)) + "\n"

    with stat_file as f:
        f.write("\t".join(str(col) for col in columns) + "\n")
        if not sar_dict:
            return

        for name, data in sar_dict.items():
            for tmd_size in range(sar_max, sar_min - 1, -1):
                key = "TMD_" + str(tmd_size)
                if key not in data:
                    continue
                for each_match in data[key]:
                    if each_match == [""]:
                        continue
                    try:
                        fields = [
                            name,
                            data["sequence"],
                            data["size"],
                            tmd_size,
                            # Start/end are shifted by one; presumably
                            # 0-based -> 1-based, confirm against producer.
                            int(each_match[7]) + 1,
                            each_match[0],
                            int(each_match[8]) + 1,
                            list(each_match[3]),
                            round(sum(pct for _, pct in each_match[3]), 2),
                            each_match[1],
                            each_match[2],
                        ]
                    except IndexError:
                        # A truncated match still yields a (marker) row so
                        # the table keeps one line per match.
                        f.write(error_row)
                    else:
                        f.write("\t".join(str(field) for field in fields) + "\n")
def stat_file_from_SAR_dict(sar_dict, stat_file, sar_min, sar_max):
    """Write a human-readable summary of the SAR candidates.

    After a banner, each candidate gets its description, sequence and
    length, then every match grouped by SAR length from ``sar_max`` down to
    ``sar_min`` (inclusive), and finally a separator line. When ``sar_dict``
    is empty, a single "not found" message follows the banner.

    Args:
        sar_dict: Mapping whose values are candidate dicts providing
            ``"description"``, ``"sequence"``, ``"size"`` and optional
            ``"TMD_<n>"`` lists of matches. A match is indexed as: ``[0]``
            SAR domain sequence, ``[1]`` N-terminal sequence, ``[2]``
            N-terminal net charge, ``[3]`` iterable of (residue, percent)
            items, ``[4]``/``[5]``/``[6]`` N-terminal/SAR/C-terminal
            coordinates, ``[7]`` SAR start.
        stat_file: Writable text file object that is also a context manager;
            it is entered with ``with`` here.
        sar_min: Smallest SAR length to report (inclusive).
        sar_max: Largest SAR length to report (inclusive).
    """
    with stat_file as f:
        f.write("..........:::::: Candidate SAR Proteins ::::::..........\n\n")
        if not sar_dict:
            f.write("No candidate SAR Proteins found")
            return

        for data in sar_dict.values():
            f.write(f"Protein Description and Name: {data['description']}\n")
            f.write(f"Protein Sequence: {data['sequence']}\n")
            f.write(f"Protein Length: {data['size']}\n")
            f.write("SAR Criteria matching region(s)\n")

            for tmd_size in range(sar_max, sar_min - 1, -1):
                key = "TMD_" + str(tmd_size)
                if key not in data:
                    continue
                f.write(f"\nSAR length of {tmd_size}:\n")
                for each_match in data[key]:
                    # ``[""]`` is the placeholder for "no match"; skip it.
                    if each_match == [""]:
                        continue
                    f.write(f"\nPotential SAR domain sequence: {each_match[0]}\n")
                    f.write(f"N-term sequence: {each_match[1]}\n")
                    f.write(f"N-term net charge: {each_match[2]}\n")
                    for perc in each_match[3]:
                        f.write(f"Percent {perc[0]} content: {perc[1]}%\n")
                    f.write(f"N-term coords: {each_match[4]}\n")
                    f.write(f"SAR coords: {each_match[5]}\n")
                    f.write(f"C-term coords: {each_match[6]}\n")
                    f.write(f"SAR start: {each_match[7]}\n")

            # One separator per candidate protein.
            f.write("========================================================\n\n")
def split_seq_string(input_range, python_indexing=True):
    """Split a ``"start..end"`` range string into its two endpoints.

    Args:
        input_range: String of the form ``"<start>..<end>"``.
        python_indexing: When true (the default), both endpoints are parsed
            as integers and incremented by one. When false, the endpoints
            are returned unchanged, as the substrings found in
            ``input_range``.

    Returns:
        A ``(start, end)`` tuple: two ints when ``python_indexing`` is true,
        two strs otherwise.

    Raises:
        IndexError: If ``input_range`` contains no ``".."`` separator.
        ValueError: If ``python_indexing`` is true and an endpoint is not an
            integer literal.
    """
    values = input_range.split("..")
    if python_indexing:
        return int(values[0]) + 1, int(values[1]) + 1
    return values[0], values[1]
if __name__ == "__main__":
    # Running this module directly is a no-op.
    pass

Mercurial > repos > cpt > cpt_sar_finder

comparison file_operations.py @ 1:112751823323 draft