#!/usr/bin/env python
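"""Build annotated 'cascade' and 'sort' SNP tables as Excel spreadsheets.

Inputs are a SNP table (JSON), per-sample average mapping quality (JSON)
and a Newick tree; features from an optional GenBank file are used to
annotate positions.  Tables wider than MAXCOLS columns are split across
multiple output files written to OUTPUT_EXCEL_DIR.
"""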

import argparse
import os
import re

import pandas
import pandas.io.formats.excel
from Bio import SeqIO

# Maximum columns allowed in a LibreOffice
# spreadsheet is 1024.  Excel allows for
# 16,384 columns, but we'll set the lower
# number as the maximum.  Some browsers
# (e.g., Firefox on Linux) are configured
# to use LibreOffice for Excel spreadsheets.
MAXCOLS = 1024
OUTPUT_EXCEL_DIR = 'output_excel_dir'


def annotate_table(table_df, group, annotation_dict):
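    """
    Add an annotations row to table_df.

    Each column header of table_df is a 'reference:position' string.  For
    every position that falls within an annotated feature in annotation_dict,
    the feature's product, gene and locus tag are written to a temporary
    <group>_annotations.csv file, which is then merged back into the
    transposed table and returned as a new 'annotations' row.
    """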
    for gbk_chrome, pro in list(annotation_dict.items()):
        ref_pos = list(table_df)
        ref_series = pandas.Series(ref_pos)
        ref_df = pandas.DataFrame(ref_series.str.split(':', expand=True).values, columns=['reference', 'position'])
        all_ref = ref_df[ref_df['reference'] == gbk_chrome]
        positions = all_ref.position.to_frame()
        # Create an annotation file.
        annotation_file = "%s_annotations.csv" % group
        with open(annotation_file, "a") as fh:
            for _, row in positions.iterrows():
                pos = row.position
                try:
                    aaa = pro.iloc[pro.index.get_loc(int(pos))][['chrom', 'locus', 'product', 'gene']]
                    try:
                        chrom, name, locus, tag = aaa.values[0]
                        print("{}:{}\t{}, {}, {}".format(chrom, pos, locus, tag, name), file=fh)
                    except ValueError:
                        # If there is only one annotation for the entire
                        # chromosome (e.g., flu), indexing with [0] fails.
                        chrom, name, locus, tag = aaa.values
                        print("{}:{}\t{}, {}, {}".format(chrom, pos, locus, tag, name), file=fh)
                except KeyError:
                    print("{}:{}\tNo annotated product".format(gbk_chrome, pos), file=fh)
    # Read the annotation file into a data frame.
    annotations_df = pandas.read_csv(annotation_file, sep='\t', header=None, names=['index', 'annotations'], index_col='index')
    # Remove the annotation_file from disk since both
    # cascade and sort tables are built using the file,
    # and it is opened for writing in append mode.
    os.remove(annotation_file)
    # Process the data.
    table_df_transposed = table_df.T
    table_df_transposed.index = table_df_transposed.index.rename('index')
    table_df_transposed = table_df_transposed.merge(annotations_df, left_index=True, right_index=True)
    table_df = table_df_transposed.T
    return table_df


def excel_formatter(json_file_name, excel_file_name, group, annotation_dict):
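    """
    Write the JSON table to a formatted Excel file.

    Loads the table from json_file_name, adds annotations (or a
    'no annotations' row), and writes it to excel_file_name with
    per-nucleotide background colors, rotated headers, and conditional
    formatting for low-quality and ambiguous calls.
    """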
    pandas.io.formats.excel.header_style = None
    table_df = pandas.read_json(json_file_name, orient='split')
    if annotation_dict is not None:
        table_df = annotate_table(table_df, group, annotation_dict)
    else:
        table_df = table_df.append(pandas.Series(name='no annotations'))
    writer = pandas.ExcelWriter(excel_file_name, engine='xlsxwriter')
    table_df.to_excel(writer, sheet_name='Sheet1')
    writer_book = writer.book
    ws = writer.sheets['Sheet1']
    format_a = writer_book.add_format({'bg_color': '#58FA82'})
    format_g = writer_book.add_format({'bg_color': '#F7FE2E'})
    format_c = writer_book.add_format({'bg_color': '#0000FF'})
    format_t = writer_book.add_format({'bg_color': '#FF0000'})
    format_normal = writer_book.add_format({'bg_color': '#FDFEFE'})
    format_lowqual = writer_book.add_format({'font_color': '#C70039', 'bg_color': '#E2CFDD'})
    format_ambiguous = writer_book.add_format({'font_color': '#C70039', 'bg_color': '#E2CFDD'})
    format_n = writer_book.add_format({'bg_color': '#E2CFDD'})
    rows, cols = table_df.shape
    ws.set_column(0, 0, 30)
    ws.set_column(1, cols, 2.1)
    ws.freeze_panes(2, 1)
    format_annotation = writer_book.add_format({'font_color': '#0A028C', 'rotation': '-90', 'align': 'top'})
    # Set the last row.
    ws.set_row(rows + 1, cols + 1, format_annotation)
    # Make sure that row/column locations don't overlap.
    ws.conditional_format(rows - 2, 1, rows - 1, cols, {'type': 'cell', 'criteria': '<', 'value': 55, 'format': format_lowqual})
    ws.conditional_format(2, 1, rows - 2, cols, {'type': 'cell', 'criteria': '==', 'value': 'B$2', 'format': format_normal})
    ws.conditional_format(2, 1, rows - 2, cols, {'type': 'text', 'criteria': 'containing', 'value': 'A', 'format': format_a})
    ws.conditional_format(2, 1, rows - 2, cols, {'type': 'text', 'criteria': 'containing', 'value': 'G', 'format': format_g})
    ws.conditional_format(2, 1, rows - 2, cols, {'type': 'text', 'criteria': 'containing', 'value': 'C', 'format': format_c})
    ws.conditional_format(2, 1, rows - 2, cols, {'type': 'text', 'criteria': 'containing', 'value': 'T', 'format': format_t})
    ws.conditional_format(2, 1, rows - 2, cols, {'type': 'text', 'criteria': 'containing', 'value': 'S', 'format': format_ambiguous})
    ws.conditional_format(2, 1, rows - 2, cols, {'type': 'text', 'criteria': 'containing', 'value': 'Y', 'format': format_ambiguous})
    ws.conditional_format(2, 1, rows - 2, cols, {'type': 'text', 'criteria': 'containing', 'value': 'R', 'format': format_ambiguous})
    ws.conditional_format(2, 1, rows - 2, cols, {'type': 'text', 'criteria': 'containing', 'value': 'W', 'format': format_ambiguous})
    ws.conditional_format(2, 1, rows - 2, cols, {'type': 'text', 'criteria': 'containing', 'value': 'K', 'format': format_ambiguous})
    ws.conditional_format(2, 1, rows - 2, cols, {'type': 'text', 'criteria': 'containing', 'value': 'M', 'format': format_ambiguous})
    ws.conditional_format(2, 1, rows - 2, cols, {'type': 'text', 'criteria': 'containing', 'value': 'N', 'format': format_n})
    ws.conditional_format(2, 1, rows - 2, cols, {'type': 'text', 'criteria': 'containing', 'value': '-', 'format': format_n})
    format_rotation = writer_book.add_format({})
    format_rotation.set_rotation(90)
    for column_num, column_name in enumerate(list(table_df.columns)):
        ws.write(0, column_num + 1, column_name, format_rotation)
    format_annotation = writer_book.add_format({'font_color': '#0A028C', 'rotation': '-90', 'align': 'top'})
    # Set the last row.
    ws.set_row(rows, 400, format_annotation)
    writer.save()


def get_annotation_dict(gbk_file):
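    """
    Build a dict mapping each chromosome in the GenBank file to a data
    frame of its CDS and rRNA features (chrom, start, stop, locus,
    product, gene), indexed by a closed IntervalIndex over the feature
    coordinates so a position can be looked up with index.get_loc().
    """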
    gbk_dict = SeqIO.to_dict(SeqIO.parse(gbk_file, "genbank"))
    annotation_dict = {}
    tmp_file = "features.csv"
    # Create a file of chromosomes and features.
    for chromosome in list(gbk_dict.keys()):
        with open(tmp_file, 'w+') as fh:
            for feature in gbk_dict[chromosome].features:
                if "CDS" in feature.type or "rRNA" in feature.type:
                    try:
                        product = feature.qualifiers['product'][0]
                    except KeyError:
                        product = None
                    try:
                        locus = feature.qualifiers['locus_tag'][0]
                    except KeyError:
                        locus = None
                    try:
                        gene = feature.qualifiers['gene'][0]
                    except KeyError:
                        gene = None
                    fh.write("%s\t%d\t%d\t%s\t%s\t%s\n" % (chromosome, int(feature.location.start), int(feature.location.end), locus, product, gene))
        # Read the chromosomes and features file into a data frame.
        df = pandas.read_csv(tmp_file, sep='\t', names=["chrom", "start", "stop", "locus", "product", "gene"])
        # Process the data.
        df = df.sort_values(['start', 'gene'], ascending=[True, False])
        df = df.drop_duplicates('start')
        pro = df.reset_index(drop=True)
        pro.index = pandas.IntervalIndex.from_arrays(pro['start'], pro['stop'], closed='both')
        annotation_dict[chromosome] = pro
    return annotation_dict


def get_sample_name(file_path):
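    """Return the base file name of file_path without its extension."""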
    base_file_name = os.path.basename(file_path)
    if base_file_name.find(".") > 0:
        # Eliminate the extension.
        return os.path.splitext(base_file_name)[0]
    return base_file_name


def output_cascade_table(cascade_order, mqdf, group, annotation_dict):
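    """Append the MQ row to the cascade-ordered table and write it as the 'cascade' output."""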
    cascade_order_mq = pandas.concat([cascade_order, mqdf], join='inner')
    output_table(cascade_order_mq, "cascade", group, annotation_dict)


def output_excel(df, type_str, group, annotation_dict, count=None):
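    """
    Write df to a JSON file in OUTPUT_EXCEL_DIR and then hand it to
    excel_formatter.  The count argument is included in the file names
    when a wide table has been split into multiple chunks.
    """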
    # Output the temporary json file that
    # is used by the excel_formatter.
    if count is None:
        if group is None:
            json_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_order_mq.json" % type_str)
            excel_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_table.xlsx" % type_str)
        else:
            json_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_%s_order_mq.json" % (group, type_str))
            excel_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_%s_table.xlsx" % (group, type_str))
    else:
        # The table has more columns than is allowed by the
        # MAXCOLS setting, so multiple files will be produced
        # as an output collection.
        if group is None:
            json_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_order_mq_%d.json" % (type_str, count))
            excel_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_table_%d.xlsx" % (type_str, count))
        else:
            json_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_%s_order_mq_%d.json" % (group, type_str, count))
            excel_file_name = os.path.join(OUTPUT_EXCEL_DIR, "%s_%s_table_%d.xlsx" % (group, type_str, count))
    df.to_json(json_file_name, orient='split')
    # Output the Excel file.
    excel_formatter(json_file_name, excel_file_name, group, annotation_dict)


def output_sort_table(cascade_order, mqdf, group, annotation_dict):
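    """Re-sort the table columns by genomic position, append the MQ row and write it as the 'sort' output."""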
    sort_df = cascade_order.T
    sort_df['abs_value'] = sort_df.index
    sort_df[['chrom', 'pos']] = sort_df['abs_value'].str.split(':', expand=True)
    sort_df = sort_df.drop(['abs_value', 'chrom'], axis=1)
    sort_df.pos = sort_df.pos.astype(int)
    sort_df = sort_df.sort_values(by=['pos'])
    sort_df = sort_df.drop(['pos'], axis=1)
    sort_df = sort_df.T
    sort_order_mq = pandas.concat([sort_df, mqdf], join='inner')
    output_table(sort_order_mq, "sort", group, annotation_dict)


def output_table(df, type_str, group, annotation_dict):
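    """
    Write df as one or more Excel files.  If the table has MAXCOLS or
    more columns, it is split into MAXCOLS-sized chunks, each written
    to a separate, numbered output file.
    """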
    if isinstance(group, str) and group.startswith("dataset"):
        # Inputs are single files, not collections,
        # so input file names are not useful for naming
        # output files.
        group_str = None
    else:
        group_str = group
    count = 0
    chunk_start = 0
    chunk_end = 0
    column_count = df.shape[1]
    if column_count >= MAXCOLS:
        # Here the number of columns is greater than
        # the maximum set by MAXCOLS, so multiple
        # outputs will be produced.
        while column_count >= MAXCOLS:
            count += 1
            chunk_end += MAXCOLS
            df_of_type = df.iloc[:, chunk_start:chunk_end]
            output_excel(df_of_type, type_str, group_str, annotation_dict, count=count)
            chunk_start += MAXCOLS
            column_count -= MAXCOLS
        count += 1
        df_of_type = df.iloc[:, chunk_start:]
        output_excel(df_of_type, type_str, group_str, annotation_dict, count=count)
    else:
        output_excel(df, type_str, group_str, annotation_dict)


def preprocess_tables(newick_file, json_file, json_avg_mq_file, annotation_dict):
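    """
    Build the cascade and sort tables for one sample group.

    Reads the average MQ and SNP JSON files, derives the sample order
    from the Newick tree, orders the SNP columns by per-column SNP
    counts, and writes the cascade and sorted Excel tables.
    """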
    avg_mq_series = pandas.read_json(json_avg_mq_file, typ='series', orient='split')
    # Map quality to dataframe.
    mqdf = avg_mq_series.to_frame(name='MQ')
    mqdf = mqdf.T
    # Get the group.
    group = get_sample_name(newick_file)
    snps_df = pandas.read_json(json_file, orient='split')
    with open(newick_file, 'r') as fh:
        for line in fh:
            line = re.sub('[:,]', '\n', line)
            line = re.sub('[)(]', '', line)
            line = re.sub(r'[0-9].*\.[0-9].*\n', '', line)
            line = re.sub('root\n', '', line)
            sample_order = line.split('\n')
            sample_order = list([_f for _f in sample_order if _f])
            sample_order.insert(0, 'root')
            tree_order = snps_df.loc[sample_order]
            # Count number of SNPs in each column.
            snp_per_column = []
            for column_header in tree_order:
                count = 0
                column = tree_order[column_header]
                for element in column:
                    if element != column[0]:
                        count = count + 1
                snp_per_column.append(count)
            row1 = pandas.Series(snp_per_column, tree_order.columns, name="snp_per_column")
            # Count number of SNPs from the
            # top of each column in the table.
            snp_from_top = []
            for column_header in tree_order:
                count = 0
                column = tree_order[column_header]
                # For each element in the column,
                # skip the first element.
                for element in column[1:]:
                    if element == column[0]:
                        count = count + 1
                    else:
                        break
                snp_from_top.append(count)
            row2 = pandas.Series(snp_from_top, tree_order.columns, name="snp_from_top")
            tree_order = tree_order.append([row1])
            tree_order = tree_order.append([row2])
            # In pandas=0.18.1 even this does not work:
            # abc = row1.to_frame()
            # abc = abc.T --> tree_order.shape (5, 18), abc.shape (1, 18)
            # tree_order.append(abc)
            # Continue to get error: "*** ValueError: all the input arrays must have same number of dimensions"
            tree_order = tree_order.T
            tree_order = tree_order.sort_values(['snp_from_top', 'snp_per_column'], ascending=[True, False])
            tree_order = tree_order.T
            # Remove snp_per_column and snp_from_top rows.
            cascade_order = tree_order[:-2]
            # Output the cascade table.
            output_cascade_table(cascade_order, mqdf, group, annotation_dict)
            # Output the sorted table.
            output_sort_table(cascade_order, mqdf, group, annotation_dict)


if __name__ == '__main__':
    parser = argparse.ArgumentParser()

    parser.add_argument('--gbk_file', action='store', dest='gbk_file', required=False, default=None, help='Optional gbk file')
    parser.add_argument('--input_avg_mq_json', action='store', dest='input_avg_mq_json', help='Average MQ json file')
    parser.add_argument('--input_newick', action='store', dest='input_newick', help='Newick file')
    parser.add_argument('--input_snps_json', action='store', dest='input_snps_json', help='SNPs json file')

    args = parser.parse_args()

    if args.gbk_file is not None:
        # Create the annotation_dict for annotating
        # the Excel tables.
        annotation_dict = get_annotation_dict(args.gbk_file)
    else:
        annotation_dict = None

    preprocess_tables(args.input_newick, args.input_snps_json, args.input_avg_mq_json, annotation_dict)