annotate get_unique_srm.py @ 2:b526dba9dc40 draft default tip

"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
author proteore
date Mon, 10 May 2021 13:56:03 +0000
parents a2b06836de90
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
2
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
1 import argparse
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
2 import csv
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
3 import re
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
4
0
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
5
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
def get_args(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse falls back to sys.argv[1:].

    Returns:
        argparse.Namespace with the attributes input_type, input, header,
        column_number, features, ref_file and output. header and
        column_number are None when not supplied.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input_type",
        help="type of input (list of id or filename)",
        required=True,
    )
    parser.add_argument(
        "-i", "--input",
        help="list of IDs (text or filename)",
        required=True,
    )
    parser.add_argument(
        "--header",
        help="true/false if your file contains a header",
    )
    # The value is parsed by nb_col_to_int: a 1-based number with an
    # optional "c" prefix.
    parser.add_argument(
        "-c", "--column_number",
        help="column containing the IDs, e.g. c1 (file input only)",
    )
    parser.add_argument(
        "-f", "--features",
        help="Protein features to return from SRM Atlas",
        required=True,
    )
    parser.add_argument(
        "-d", "--ref_file",
        help="path to reference file",
        required=True,
    )
    parser.add_argument(
        "-o", "--output",
        help="output filename",
        required=True,
    )
    return parser.parse_args(argv)
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
18
2
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
19 # return the column number in int format
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
20
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
21
0
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
def nb_col_to_int(nb_col):
    """Convert a column label such as "c3" into a zero-based index.

    Args:
        nb_col: 1-based column number as a string, optionally prefixed
            with "c" (every "c" character is removed before parsing).

    Returns:
        The zero-based column index as an int.

    Raises:
        SystemExit: if nb_col is not a string, is not numeric once the
            "c" is removed, or designates a column lower than 1.
    """
    import sys  # local import: sys is not among the module-level imports

    try:
        index = int(nb_col.replace("c", "")) - 1
    except (AttributeError, ValueError):
        # AttributeError: nb_col is None / not a string;
        # ValueError: nothing numeric is left to parse.
        index = -1
    if index < 0:
        # A negative index would silently address columns from the end.
        sys.exit(
            "Please specify the column where you would like to apply "
            "the filter with valid format"
        )
    return index
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
28
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
29 # replace all blank cells to NA
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
30
0
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
31
2
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
def blank_to_NA(csv_file):
    """Return the rows with every blank cell replaced by "NA".

    A cell is considered blank when it is exactly "", " " or "NaN".

    Args:
        csv_file: iterable of rows, each row being an iterable of cells.

    Returns:
        A new list of lists; the input rows are not modified.
    """
    blank_values = ("", " ", "NaN")
    return [
        ["NA" if cell in blank_values else cell for cell in row]
        for row in csv_file
    ]
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
39
2
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
40 # convert string to boolean
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
41
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
42
0
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
def str2bool(v):
    """Interpret a textual flag as a boolean.

    Args:
        v: one of 'yes', 'true', 't', 'y', '1' (True) or 'no', 'false',
            'f', 'n', '0' (False), case-insensitive. An actual bool is
            returned unchanged.

    Returns:
        The corresponding bool.

    Raises:
        argparse.ArgumentTypeError: if v is not a recognised flag,
            including None (e.g. when the option was not supplied).
    """
    if isinstance(v, bool):
        return v
    if not isinstance(v, str):
        # Covers None, which would otherwise fail on .lower().
        raise argparse.ArgumentTypeError('Boolean value expected.')
    flag = v.lower()
    if flag in ('yes', 'true', 't', 'y', '1'):
        return True
    if flag in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
50
2
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
51 # return list of (unique) ids from string
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
52
0
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
53
2
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
def get_input_ids_from_string(input):
    """Extract the unique IDs from a whitespace-separated string.

    Every "_SNP" and "d_" substring and every carriage return is removed
    before the text is split on runs of whitespace.

    Args:
        input: text containing IDs separated by spaces, tabs or newlines.

    Returns:
        List of unique, non-empty IDs in order of first appearance.
    """
    cleaned = input.replace("_SNP", "").replace("d_", "").replace("\r", "")
    # str.split() without argument splits on any whitespace run and never
    # yields empty strings; dict.fromkeys de-duplicates while keeping a
    # reproducible order.
    return list(dict.fromkeys(cleaned.split()))
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
61
2
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
62 # return input_file and list of unique ids from input file path
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
63
0
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
64
2
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
def get_input_ids_from_file(input, nb_col, header):
    """Read a tab-separated file and extract the IDs of one column.

    Args:
        input: path of the tab-separated input file.
        nb_col: zero-based index of the column holding the IDs.
        header: True when the first row of the file is a header.

    Returns:
        A (input_file, ids_list) tuple as produced by one_id_one_line,
        with the empty string removed from ids_list.
    """
    # newline="" leaves line-ending handling to the csv module.
    with open(input, "r", newline="") as csv_file:
        rows = list(csv.reader(csv_file, delimiter='\t'))

    input_file, ids_list = one_id_one_line(rows, nb_col, header)
    ids_list = [identifier for identifier in ids_list if identifier != ""]

    return input_file, ids_list
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
74
2
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
75 # function to check if an id is an uniprot accession number:
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
76 # return True or False
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
77
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
78
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
def check_uniprot(id):
    """Tell whether a string starts like a UniProt accession number.

    Only the beginning of the string is tested (re.match), so trailing
    characters after a valid accession do not prevent a match.

    Args:
        id: the identifier to test.

    Returns:
        True when the pattern matches at the start of id, else False.
    """
    uniprot_pattern = re.compile(
        r"[OPQ][0-9][A-Z0-9]{3}[0-9]"
        r"|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}"
    )
    return uniprot_pattern.match(id) is not None
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
85
2
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
86 # return input file by adding lines when there are more than one id per line
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
87
0
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
88
2
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
def one_id_one_line(input_file, nb_col, header):
    """Expand rows holding several ';'-separated IDs into one row per ID.

    Whitespace is removed from the ID cell. An ID cell left empty becomes
    'NA'. Empty rows and rows made only of empty cells are dropped. The
    rows passed in are not modified.

    Args:
        input_file: list of rows (lists of cells) read from the input.
        nb_col: zero-based index of the column holding the IDs.
        header: True when the first row is a header, which is then copied
            unchanged as the first row of the result.

    Returns:
        A (new_file, ids_list) tuple. new_file is the expanded list of
        rows, keeping the IDs as written in the file. ids_list holds the
        unique IDs with "_SNP" and "d_" removed, in order of first
        appearance.
    """
    # Slicing instead of indexing keeps an empty file from raising.
    new_file = input_file[:1] if header else []
    data_rows = input_file[1:] if header else input_file
    ids_list = []

    for line in data_rows:
        if not line or set(line) == {''}:
            continue
        cell = re.sub(r"\s+", "", line[nb_col])
        # Discard the empty fragments left by a doubled or trailing ";";
        # a cell without any ID is reported as 'NA'.
        ids = [fragment for fragment in cell.split(";") if fragment]
        if not ids:
            ids = ['NA']
        for identifier in ids:
            new_file.append(line[:nb_col] + [identifier] + line[nb_col + 1:])
            ids_list.append(identifier)

    ids_list = [e.replace("_SNP", "").replace("d_", "") for e in ids_list]
    # dict.fromkeys de-duplicates while keeping a reproducible order.
    ids_list = list(dict.fromkeys(ids_list))

    return new_file, ids_list
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
116
2
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
117
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
def create_srm_atlas_dictionary(features, srm_atlas_csv):
    """Index the requested feature columns of the reference table by ID.

    The first row of srm_atlas_csv is skipped as a header. The ID is read
    from the tenth column, with "_SNP" and "d_" removed. Rows too short
    to contain that column (e.g. blank lines) are ignored.

    Args:
        features: list of feature names among "PeptideSeq", "SSRT",
            "Length", "type", "PA_AccNum" and "MW".
        srm_atlas_csv: list of rows (lists of cells) of the reference
            file.

    Returns:
        Dict mapping each ID to a list of feature-value lists, one per
        reference row carrying that ID, in file order.

    Raises:
        KeyError: if a requested feature name is unknown.
    """
    id_column = 9
    features_index = {
        "PeptideSeq": 0,
        "SSRT": 1,
        "Length": 2,
        "type": 3,
        "PA_AccNum": 4,
        "MW": 5,
    }
    features_to_get = [features_index[feature] for feature in features]

    srm_atlas = {}
    for line in srm_atlas_csv[1:]:
        if len(line) <= id_column:
            continue
        accession = line[id_column].replace("_SNP", "").replace("d_", "")
        values = [line[i] for i in features_to_get]
        srm_atlas.setdefault(accession, []).append(values)
    return srm_atlas
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
130
2
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
131
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
def retrieve_srm_features(srm_atlas, ids):
    """Look up each ID in the SRM Atlas dictionary.

    Args:
        srm_atlas: dict mapping an ID to its list of feature rows.
        ids: iterable of IDs to look up.

    Returns:
        Dict mapping every requested ID to its entry in srm_atlas, or to
        the empty string when the ID is absent.
    """
    return {identifier: srm_atlas.get(identifier, "") for identifier in ids}
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
142
2
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
143
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
def create_header(input_file, ncol, features):
    """Build column names for an input file that has no header row.

    Columns are named "col1", "col2", ... after the width of the first
    row; the ID column is named "Uniprot-AC" and the feature names are
    appended at the end.

    Args:
        input_file: non-empty list of rows; only the first row is read.
        ncol: zero-based index of the ID column.
        features: sequence of feature names.

    Returns:
        List of column names.
    """
    width = len(input_file[0])
    col_names = ["col" + str(position) for position in range(1, width + 1)]
    col_names[ncol] = "Uniprot-AC"
    return col_names + list(features)
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
150
2
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
151
0
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
def main():
    """Annotate the input IDs with features from the SRM Atlas reference.

    Reads the command-line arguments, loads the tab-separated reference
    file, collects the IDs from a string ("list") or from a column of a
    tab-separated file ("file"), and writes a tab-separated output with
    one row per reference entry found. IDs absent from the reference
    produce no output row. Blank cells are written as "NA" and a row
    identical to the one just written is skipped.

    Raises:
        SystemExit: on an unknown input type, or (with status 0, after
            printing a message) when no ID looks like a UniProt
            accession number.
    """
    import sys  # local import: sys is not among the module-level imports

    blank_cells = ("", " ", "NaN")

    # Get args from command line
    args = get_args()
    features = args.features.split(",")
    if args.input_type not in ("list", "file"):
        sys.exit(
            "Unknown input type '%s', expected 'list' or 'file'"
            % args.input_type
        )
    from_file = args.input_type == "file"
    header = False
    if from_file:
        column_number = nb_col_to_int(args.column_number)
        header = str2bool(args.header)

    # Get reference file (Human SRM Atlas)
    with open(args.ref_file, "r", newline="") as csv_file:
        srm_atlas_csv = list(csv.reader(csv_file, delimiter='\t'))

    # Create srm Atlas dictionary
    srm_atlas = create_srm_atlas_dictionary(features, srm_atlas_csv)

    # Get file and/or ids from input
    if from_file:
        input_file, ids = get_input_ids_from_file(args.input,
                                                  column_number, header)
    else:
        ids = get_input_ids_from_string(args.input)

    # Check Uniprot-AC
    if not any(check_uniprot(identifier) for identifier in ids):
        print("No Uniprot-AC found, please check your input")
        # sys.exit() rather than the interactive-only exit() helper.
        sys.exit()

    # retrieve features
    result_dict = retrieve_srm_features(srm_atlas, ids)

    # write output
    with open(args.output, "w", newline="") as output:
        writer = csv.writer(output, delimiter="\t")

        # write header
        if header:
            writer.writerow(input_file[0] + features)
            input_file = input_file[1:]
        elif from_file:
            # create_header already returns the row: it must not be
            # wrapped in another list.
            writer.writerow(create_header(input_file, column_number,
                                          features))
        else:
            writer.writerow(["Uniprot-AC"] + features)

        # write lines
        previous_line = ""
        if from_file:
            for line in input_file:
                # result_dict is keyed by IDs stripped of "_SNP"/"d_",
                # while the rows keep the IDs as written in the file.
                key = line[column_number]
                key = key.replace("_SNP", "").replace("d_", "")
                for res in result_dict.get(key, ""):
                    output_line = ["NA" if cell in blank_cells else cell
                                   for cell in line + res]
                    if previous_line != output_line:
                        writer.writerow(output_line)
                        previous_line = output_line
        else:
            for identifier in ids:
                for res in result_dict[identifier]:
                    line = ["NA" if cell in blank_cells else cell
                            for cell in [identifier] + res]
                    if previous_line != line:
                        writer.writerow(line)
                        previous_line = line
b526dba9dc40 "planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents: 0
diff changeset
215
0
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
216
a2b06836de90 planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff changeset
# Run the tool only when the file is executed as a script.
if __name__ == "__main__":
    main()