Mercurial > repos > proteore > proteore_get_unique_peptide_srm_method
annotate get_unique_srm.py @ 2:b526dba9dc40 draft default tip
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
author | proteore |
---|---|
date | Mon, 10 May 2021 13:56:03 +0000 |
parents | a2b06836de90 |
children |
rev | line source |
---|---|
2
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
1 import argparse |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
2 import csv |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
3 import re |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
4 |
0
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
5 |
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
def get_args(argv=None):
    """Parse the command-line options of the tool.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which
            is what a plain ``get_args()`` call has always done.

    Returns:
        argparse.Namespace with the attributes ``input_type``, ``input``,
        ``header``, ``column_number``, ``features``, ``ref_file`` and
        ``output``. ``header`` and ``column_number`` are ``None`` when the
        corresponding option is not given.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input_type", help="type of input (list of id or filename)", required=True)  # noqa 501
    parser.add_argument("-i", "--input", help="list of IDs (text or filename)", required=True)  # noqa 501
    parser.add_argument("--header", help="true/false if your file contains a header")  # noqa 501
    # The help text used to be a copy of the --input one.
    parser.add_argument("-c", "--column_number", help="column of the input file that contains the IDs (e.g. c1)")  # noqa 501
    parser.add_argument("-f", "--features", help="Protein features to return from SRM Atlas", required=True)  # noqa 501
    parser.add_argument("-d", "--ref_file", help="path to reference file", required=True)  # noqa 501
    parser.add_argument("-o", "--output", help="output filename", required=True)  # noqa 501
    return parser.parse_args(argv)
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
18 |
2
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
19 # return the column number in int format |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
20 |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
21 |
0
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
def nb_col_to_int(nb_col):
    """Convert a column designation such as "c3" into a zero-based index.

    Every "c" character is removed from ``nb_col``, the remainder is parsed
    as an integer and 1 is subtracted, so "c1" (or "1") gives 0.

    Args:
        nb_col: Column designation as a string, e.g. "c2".

    Returns:
        The zero-based column index as an int.

    Raises:
        SystemExit: if ``nb_col`` is not a string, does not contain a valid
            integer, or designates a column before the first one.
    """
    error_message = "Please specify the column where you would like to apply the filter with valid format"  # noqa 501
    try:
        index = int(nb_col.replace("c", "")) - 1
    except (AttributeError, ValueError):
        # AttributeError: nb_col is not a string (e.g. None when the option
        # was omitted); ValueError: the remainder is not an integer.
        # Raising SystemExit directly is what sys.exit() does, without
        # needing the sys module, which this file never imports.
        raise SystemExit(error_message)
    if index < 0:
        # "c0" would otherwise become -1 and silently select the last column.
        raise SystemExit(error_message)
    return index
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
28 |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
29 # replace all blank cells with NA |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
30 |
0
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
31 |
2
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
def blank_to_NA(csv_file):
    """Return the rows of ``csv_file`` with blank cells replaced by "NA".

    A cell counts as blank when it is exactly the empty string, a single
    space, or the string "NaN". The input rows are not modified; a new list
    of new row lists is returned.
    """
    blank_values = ("", " ", "NaN")
    return [
        ["NA" if cell in blank_values else cell for cell in row]
        for row in csv_file
    ]
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
39 |
2
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
40 # convert string to boolean |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
41 |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
42 |
0
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
def str2bool(v):
    """Convert a yes/no style string into a boolean.

    Accepted values (case-insensitive, surrounding whitespace ignored):
    'yes', 'true', 't', 'y', '1' give True; 'no', 'false', 'f', 'n', '0'
    give False. A value that is already a bool is returned unchanged.

    Args:
        v: The value to interpret, normally a string from the command line.

    Returns:
        The corresponding bool.

    Raises:
        argparse.ArgumentTypeError: if ``v`` is not a recognised value,
            including ``None`` (an option that was not supplied).
    """
    if isinstance(v, bool):
        return v
    if not isinstance(v, str):
        # Previously this failed with AttributeError on v.lower().
        raise argparse.ArgumentTypeError('Boolean value expected.')

    token = v.strip().lower()
    if token in ('yes', 'true', 't', 'y', '1'):
        return True
    if token in ('no', 'false', 'f', 'n', '0'):
        return False
    raise argparse.ArgumentTypeError('Boolean value expected.')
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
50 |
2
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
51 # return list of (unique) ids from string |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
52 |
0
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
53 |
2
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
def get_input_ids_from_string(input):
    """Return the unique ids contained in a whitespace-separated string.

    The substrings "_SNP" and "d_" and all carriage returns are removed
    from ``input`` first; the result is split on runs of whitespace.

    Args:
        input: Text holding the ids, separated by spaces, tabs or newlines.

    Returns:
        A list of the distinct non-empty ids, in order of first appearance
        (the order used to vary between runs because it came from a set).
    """
    cleaned = input.replace("_SNP", "").replace("d_", "").replace("\r", "")
    tokens = re.split(r'\s+', cleaned)
    # dict.fromkeys de-duplicates while keeping insertion order; leading or
    # trailing whitespace yields an empty token, which is dropped.
    return [token for token in dict.fromkeys(tokens) if token != ""]
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
61 |
2
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
62 # return input_file and list of unique ids from input file path |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
63 |
0
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
64 |
2
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
def get_input_ids_from_file(input, nb_col, header):
    """Read a tab-separated file and return its rows and its unique ids.

    The rows are passed to ``one_id_one_line``, which returns the rows to
    keep together with the list of ids found in column ``nb_col``.

    Args:
        input: Path of the tab-separated input file.
        nb_col: Zero-based index of the column holding the ids.
        header: True when the first row of the file is a header.

    Returns:
        A tuple ``(input_file, ids_list)``: the rows returned by
        ``one_id_one_line`` and the ids with any empty string removed.
    """
    # newline="" is what the csv module documentation requires for files
    # handed to csv.reader, so the reader does its own newline handling.
    with open(input, "r", newline="") as csv_file:
        rows = list(csv.reader(csv_file, delimiter='\t'))

    input_file, ids_list = one_id_one_line(rows, nb_col, header)
    ids_list = [identifier for identifier in ids_list if identifier != ""]

    return input_file, ids_list
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
74 |
2
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
75 # function to check if an id is a UniProt accession number: |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
76 # return True or False |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
77 |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
78 |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
def check_uniprot(id):
    """Tell whether ``id`` starts with a UniProt accession number.

    The pattern is applied with ``re.match``, so only the beginning of the
    string is tested; trailing characters after a valid accession do not
    prevent a match.

    Returns:
        True when the pattern matches at the start of ``id``, else False.
    """
    accession_re = re.compile("[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2}")  # noqa 501
    return accession_re.match(id) is not None
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
85 |
2
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
86 # return input file by adding lines when there are more than one id per line |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
87 |
0
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
88 |
2
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
def one_id_one_line(input_file, nb_col, header):
    """Expand rows holding several ';'-separated ids into one row per id.

    Rows that are empty or contain only empty cells are dropped. In every
    other row all whitespace is removed from the id cell; an id cell that
    ends up empty becomes "NA". Empty fragments produced by stray ';'
    (as in "A;B;" or "A;;B") are ignored instead of producing rows with an
    empty id.

    Args:
        input_file: List of rows (lists of strings). The rows themselves
            are left unmodified.
        nb_col: Index of the column holding the ids.
        header: True when ``input_file[0]`` is a header row, which is then
            copied unchanged to the start of the returned rows.

    Returns:
        A tuple ``(new_file, ids_list)``. ``new_file`` holds the header (if
        any) followed by one row per id. ``ids_list`` holds the distinct
        ids with the substrings "_SNP" and "d_" removed, in order of first
        appearance.

    Raises:
        IndexError: if ``header`` is true and ``input_file`` is empty, or
            if a kept row has no column ``nb_col``.
    """
    if header:
        new_file = [input_file[0]]
        input_file = input_file[1:]
    else:
        new_file = []
    ids_list = []

    for line in input_file:
        if line == [] or set(line) == {''}:
            continue
        cell = re.sub(r"\s+", "", line[nb_col])
        fragments = [frag for frag in cell.split(";") if frag != ""]
        if not fragments:
            fragments = ["NA"]
        for frag in fragments:
            # Copy and assign rather than slice around nb_col, so that a
            # negative column index still targets the right cell.
            row = list(line)
            row[nb_col] = frag
            new_file.append(row)
            ids_list.append(frag)

    normalized = (e.replace("_SNP", "").replace("d_", "") for e in ids_list)
    # dict.fromkeys removes duplicates while keeping a deterministic order.
    ids_list = list(dict.fromkeys(normalized))

    return new_file, ids_list
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
116 |
2
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
117 |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
def create_srm_atlas_dictionary(features, srm_atlas_csv):
    """Index the rows of the reference table by identifier.

    The first row of ``srm_atlas_csv`` is skipped. For every other row the
    identifier is read from column index 9, with the substrings "_SNP" and
    "d_" removed, and the requested feature columns are collected. Rows
    too short to contain the identifier and the requested columns (for
    instance a blank trailing line) are skipped instead of raising
    IndexError.

    Args:
        features: List of feature names; each must be one of "PeptideSeq",
            "SSRT", "Length", "type", "PA_AccNum" or "MW".
        srm_atlas_csv: List of rows (lists of strings) of the reference
            file, header row first.

    Returns:
        A dict mapping each identifier to a list with one entry per
        matching row, each entry being the list of requested feature
        values in the order given by ``features``.

    Raises:
        KeyError: if ``features`` contains an unknown feature name.
    """
    features_index = {"PeptideSeq": 0, "SSRT": 1, "Length": 2, "type": 3, "PA_AccNum": 4, "MW": 5}  # noqa 501
    features_to_get = [features_index[feature] for feature in features]
    id_column = 9
    min_width = max([id_column] + features_to_get) + 1

    srm_atlas = {}
    for line in srm_atlas_csv[1:]:
        if len(line) < min_width:
            continue
        key = line[id_column].replace("_SNP", "").replace("d_", "")
        srm_atlas.setdefault(key, []).append(
            [line[i] for i in features_to_get])
    return srm_atlas
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
130 |
2
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
131 |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
def retrieve_srm_features(srm_atlas, ids):
    """Look up each id of ``ids`` in ``srm_atlas``.

    Returns:
        A dict with one key per distinct id. The value is the entry stored
        in ``srm_atlas`` for that id, or the empty string when the id is
        not a key of ``srm_atlas``.
    """
    return {identifier: srm_atlas.get(identifier, "") for identifier in ids}
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
142 |
2
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
143 |
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
def create_header(input_file, ncol, features):
    """Build a header row for an input file that has none.

    One name "col1", "col2", ... is generated per cell of the first row of
    ``input_file``; the name at index ``ncol`` is replaced by "Uniprot-AC"
    and the ``features`` list is appended.

    Returns:
        The list of column names.
    """
    width = len(input_file[0])
    names = ["col" + str(position) for position in range(1, width + 1)]
    names[ncol] = "Uniprot-AC"
    return names + features
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
150 |
2
b526dba9dc40
"planemo upload commit 7592fb20f8029142757d5e5fdb8f04ff6d5ed5cd-dirty"
proteore
parents:
0
diff
changeset
|
151 |
0
a2b06836de90
planemo upload commit f9de6f4e3302c41e64c39d639bee780e5eafd84d-dirty
proteore
parents:
diff
changeset
|
def main():
    """Annotate the input ids with the requested reference-file features.

    Steps: parse the command line, load the tab-separated reference file,
    index it by identifier, collect the ids from the input (text list or
    file column), and write a tab-separated output with one row per
    (id, reference row) pair. Ids with no entry in the reference file
    produce no output row.

    Raises:
        SystemExit: if ``--input_type`` is neither "list" nor "file", or
            (with exit status 0, after printing a message) if none of the
            ids looks like a UniProt accession number.
    """
    # Get args from command line
    args = get_args()
    features = args.features.split(",")

    if args.input_type not in ("list", "file"):
        # Previously this surfaced later as a NameError on `ids`.
        raise SystemExit(
            "Unsupported input type '{}': expected 'list' or 'file'".format(
                args.input_type))

    header = False
    column_number = None
    if args.input_type == "file":
        column_number = nb_col_to_int(args.column_number)
        header = str2bool(args.header)

    # Get reference file (Human SRM Atlas)
    # newline="" is what the csv module documentation requires for files
    # handed to csv.reader / csv.writer.
    with open(args.ref_file, "r", newline="") as csv_file:
        srm_atlas_csv = list(csv.reader(csv_file, delimiter='\t'))

    # Create srm Atlas dictionary
    srm_atlas = create_srm_atlas_dictionary(features, srm_atlas_csv)

    # Get file and/or ids from input
    if args.input_type == "list":
        ids = get_input_ids_from_string(args.input)
    else:
        input_file, ids = get_input_ids_from_file(args.input,
                                                  column_number, header)

    # Check Uniprot-AC
    if not any(check_uniprot(identifier) for identifier in ids):
        print("No Uniprot-AC found, please check your input")
        # Same effect as the site-provided exit(): status 0, no output file.
        raise SystemExit

    # retrieve features
    result_dict = retrieve_srm_features(srm_atlas, ids)

    blank_values = ("", " ", "NaN")

    # write output
    with open(args.output, "w", newline="") as output:
        writer = csv.writer(output, delimiter="\t")

        # write header
        if header:
            writer.writerow(input_file[0] + features)
            input_file = input_file[1:]
        elif args.input_type == "file":
            # create_header already returns the row; wrapping it in another
            # list made writerow emit a single cell holding the list's repr.
            writer.writerow(create_header(input_file, column_number, features))
        else:
            writer.writerow(["Uniprot-AC"] + features)

        # write lines; a row identical to the one just written is skipped
        previous_line = ""
        if args.input_type == "file":
            for line in input_file:
                # result_dict is keyed by ids with "_SNP" and "d_" removed,
                # so the raw cell has to be normalised the same way; an id
                # without entry simply yields no row.
                key = line[column_number].replace("_SNP", "").replace("d_", "")
                for res in result_dict.get(key, ""):
                    output_line = ["NA" if cell in blank_values else cell
                                   for cell in line + res]
                    if previous_line != output_line:
                        writer.writerow(output_line)
                        previous_line = output_line
        else:
            for identifier in ids:
                for res in result_dict[identifier]:
                    line = ["NA" if cell in blank_values else cell
                            for cell in [identifier] + res]
                    if previous_line != line:
                        writer.writerow(line)
                        previous_line = line


if __name__ == "__main__":
    main()