comparison scripts/S02b_extreme_2states.py @ 0:acc3674e515b draft default tip

planemo upload for repository htpps://github.com/abims-sbr/adaptearch commit 3c7982d775b6f3b472f6514d791edcb43cd258a1-dirty
author abims-sbr
date Fri, 01 Feb 2019 10:28:50 -0500
parents
children
comparison
equal deleted inserted replaced
-1:000000000000 0:acc3674e515b
1 #!/usr/bin/env python
2 #coding: utf-8
3 #Author : Eric Fontanillas (2010) - Victor Mataigne (2018)
4
5 import pandas as pd
6 import argparse, os
7
8 def loop_on_elems(list_of_elems, path_in, path_out, sps_group_1, sps_group_2, colnames):
9
10 # sub-routine
11
12 def tableu(fileu, sps_group_1, sps_group_2):
13 """ a
14
15 Args :
16 fileu : input file with counts of AAs / AAs types per orthogorup
17 sps_group_1 : species for condition 1 (ex : hot water species)
18 sps_group_2 : species for condition 2 (ex : cold water species)
19
20 Returns :
21 greater_dict :
22 lower_dict :
23
24 """
25 df = pd.read_csv(fileu, sep=',', index_col=0, header=0)
26 # species = list(df) #columns names = species names
27
28 # initialize counts
29 greater_dict = {}
30 lower_dict = {}
31
32 for specie in sps_group_1+sps_group_2:
33 greater_dict[specie] = 0
34 lower_dict[specie] = 0
35
36 #nb_trials = 0
37 for (index, row) in df.iterrows():
38 # min and max counts for each condition
39 if not df.loc[index, sps_group_1+sps_group_2].isnull().values.any() :
40 #nb_trials += 1
41
42 max_cat1 = max(df.loc[index, sps_group_1]) # species in category 1 (ex : hots)
43 min_cat1 = min(df.loc[index, sps_group_1])
44 max_cat2 = max(df.loc[index, sps_group_2]) # species in category 2 (ex : colds)
45 min_cat2 = min(df.loc[index, sps_group_2])
46
47 for specie in sps_group_1:
48 if df.loc[index, specie] > max_cat2 :
49 greater_dict[specie] += 1
50 elif df.loc[index, specie] < min_cat2 :
51 lower_dict[specie] += 1
52
53 for specie in sps_group_2:
54 if df.loc[index, specie] > max_cat1 :
55 greater_dict[specie] += 1
56 elif df.loc[index, specie] < min_cat1 :
57 lower_dict[specie] += 1
58
59 return greater_dict, lower_dict#, nb_trials
60
61 # Function ------------------------------------------------------
62
63 for variable in list_of_elems:
64 print 'Processing : {} ...'.format(variable)
65 file_in = "{}/{}.csv".format(path_in, variable)
66 file_out = open('{}/{}.csv'.format(path_out,variable), 'w')
67
68 # Compute succeses and fails on each variable
69 greater_dict, lower_dict = tableu(file_in, sps_group_1, sps_group_2)
70
71 # totals and diffs
72 diff_dict = {}
73 total_dict = {}
74 for key in greater_dict.keys():
75 diff_dict[key] = greater_dict[key] - lower_dict[key]
76 total_dict[key] = greater_dict[key] + lower_dict[key]
77 #total_dict[key] = number_trials
78
79 # results frame
80 df = pd.DataFrame([greater_dict, lower_dict, diff_dict, total_dict])
81 df = df.rename({0:'Greater',1:'Lower',2:'Difference',3:'Trial_Number'}) #, axis='index' if pandas 0.15
82 df = df.rename(index=str, columns=colnames)
83
84 df.to_csv("{}/{}.csv".format(path_out, variable), sep=",", encoding="utf-8")
85
86 def main():
87 parser = argparse.ArgumentParser()
88 parser.add_argument("sps_group_1", help="List of species separated by commas")
89 parser.add_argument("sps_group_2", help="List of species separated by commas")
90 parser.add_argument("format", choices=['nucleic', 'proteic'], help="input files format")
91 args = parser.parse_args()
92
93 # used only if format = nucleic
94 LN =['A','C','T','G']
95 Lratios = ['GC_percent', 'purine_percent', 'DIFF_GC', 'DIFF_AT', 'PLI_GC', 'PLI_AT', 'PLI_GC_1000', 'PLI_AT_1000']
96
97 # used only if format = proteic
98 LAA =['K','R','A','F','I','L','M','V','W','N','Q','S','T','H','Y','C','D','E','P','G']
99 LV = ['IVYWREL','EK','ERK','DNQTSHA','QH','ratio_ERK_DNQTSHA','ratio_EK_QH','FYMINK','GARP',
100 'ratio_GARP_FYMINK','AVLIM','FYW','AVLIMFYW','STNQ','RHK','DE','RHKDE','APGC','AC',
101 'VLIM','ratio_AC_VLIM','ratio_APGC_VLIM']
102 LS = ['total_residue_weight', 'total_residue_volume', 'total_partial_specific_volume', 'total_hydratation']
103
104 # inputs and outputs paths
105 if args.format == 'nucleic':
106 input_path_elem = '02_tables_per_nucleotide'
107 input_path_var = '02_tables_per_nuc_variable'
108 out_path_elem = '03_tables_counts_signTest_nucleotides'
109 out_path_var = '03_tables_counts_signTest_nuc_variables'
110 elif args.format == 'proteic':
111 input_path_elem = '02_tables_per_aa'
112 input_path_var = '02_tables_per_aa_variable'
113 out_path_elem = '03_tables_counts_signTest_aa'
114 out_path_var = '03_tables_counts_signTest_aa_variables'
115
116 os.mkdir(out_path_elem)
117 os.mkdir(out_path_var)
118
119 sps_group_1 = args.sps_group_1.split(',')
120 sps_group_2 = args.sps_group_2.split(',')
121
122 # Prepare colnames for final frames
123 colnames = {}
124 # for specie in sps_group_1:
125 # colnames[specie] = '{}_vs_condition_1'.format(specie)
126 # for specie in sps_group_2:
127 # colnames[specie] = '{}_vs_condition_2'.format(specie)
128
129 for specie in sps_group_1:
130 colnames[specie] = '{}_vs_{}'.format(specie, args.sps_group_2.replace(',',''))
131 for specie in sps_group_2:
132 colnames[specie] = '{}_vs_{}'.format(specie, args.sps_group_1.replace(',',''))
133
134 # Building tables
135 if args.format == 'nucleic':
136 loop_on_elems(LN, input_path_elem, out_path_elem, sps_group_1, sps_group_2, colnames)
137 loop_on_elems(Lratios, input_path_var, out_path_var, sps_group_1, sps_group_2, colnames)
138 elif args.format == 'proteic':
139 loop_on_elems(LAA, input_path_elem, out_path_elem, sps_group_1, sps_group_2, colnames)
140 loop_on_elems(LV, input_path_var, out_path_var, sps_group_1, sps_group_2, colnames)
141 loop_on_elems(LS, input_path_var, out_path_var, sps_group_1, sps_group_2, colnames)
142
143 # Final R script launching sign test
144 print 'Processing : binomial sign tests ...'
145
146 if args.format == 'nucleic':
147 final_output_elem = '04_outputs_nucleotides'
148 final_output_var = '04_outputs_nuc_variables'
149 elif args.format == 'proteic':
150 final_output_elem = '04_outputs_aa'
151 final_output_var = '04_outputs_aa_variables'
152
153 os.mkdir(final_output_elem)
154 os.mkdir(final_output_var)
155 os.system('Rscript S03b_sign_test_binomial.R --indir %s --outdir %s' %(out_path_elem, final_output_elem))
156 os.system('Rscript S03b_sign_test_binomial.R --indir %s --outdir %s' %(out_path_var, final_output_var))
157
158 if __name__ == '__main__':
159 main()