comparison flowstatlib.py @ 1:b5453d07f740 draft default tip

"planemo upload for repository https://github.com/ImmPortDB/immport-galaxy-tools/tree/master/flowtools/flow_overview commit 65373effef15809f3db0e5f9603ef808f4110aa3"
author azomics
date Wed, 29 Jul 2020 17:03:53 -0400
parents
children
comparison
equal deleted inserted replaced
0:8283ff163ba6 1:b5453d07f740
1 ######################################################################
2 # Copyright (c) 2016 Northrop Grumman.
3 # All rights reserved.
4 ######################################################################
5 import pandas as pd
6 from scipy.stats import gmean
7 from argparse import ArgumentParser
8
9
def gen_overview_stats(file_name, sample_size=20000):
    """Load a tab-separated flow table and compute overview statistics.

    The last column of the table is treated as the population label of each
    event and every preceding column as a marker. The per-population
    aggregates group on a column literally named ``Population``, so the
    table must contain a column with that name.

    Args:
        file_name: Path or file-like object handed to ``pandas.read_table``.
        sample_size: Target number of events in the down-sampled table.
            Tables with at most this many events are returned unsampled.
            Defaults to 20000.

    Returns:
        A dict with the following entries:

        * ``fcs``: the full table as read.
        * ``events``: number of rows.
        * ``columns``: number of columns excluding the last one.
        * ``data``: all columns but the last.
        * ``population``: the last column.
        * ``population_freq`` / ``population_freq_sort``: events per
          population, ordered by frequency / by population label.
        * ``population_sample``: per-population quota used for sampling
          (frequency scaled by ``sample_size / events``, rounded).
        * ``population_per`` / ``population_per_sort``: percentage of events
          per population (2 decimals), ordered by frequency / by label.
        * ``population_all``: ``Count`` and ``Percentage`` per population.
        * ``min`` / ``max``: extreme values over ``data``.
        * ``markers``: column names of ``data``.
        * ``mfi`` / ``gmfi`` / ``mdfi``: per-population mean / geometric
          mean / median of each column, rounded to 2 decimals.
        * ``mfi_pop`` / ``gmfi_pop`` / ``mdfi_pop``: the above merged with
          ``population_all`` plus a ``Population`` column holding the index.
        * ``sample``: the down-sampled table (or ``fcs`` itself).
        * ``sample_data`` / ``sample_population``: all-but-last and last
          column of ``sample``.

    Raises:
        KeyError: if the table has no column named ``Population``.
        ZeroDivisionError: if the table has no rows.
    """
    flow_stats = {}
    fcs = pd.read_table(file_name)
    (events, columns) = fcs.shape
    flow_stats['fcs'] = fcs
    flow_stats['events'] = events
    flow_stats['columns'] = columns - 1
    flow_stats['data'] = fcs.iloc[:, :-1]
    flow_stats['population'] = fcs.iloc[:, -1]

    population_freq = flow_stats['population'].value_counts()
    flow_stats['population_freq'] = population_freq
    # Quota per population: its share of the events scaled to sample_size.
    flow_stats['population_sample'] = (
        population_freq * (sample_size / float(events))
    ).round(decimals=0)
    flow_stats['population_freq_sort'] = population_freq.sort_index()
    flow_stats['population_per'] = (
        flow_stats['population'].value_counts(normalize=True) * 100
    ).round(decimals=2)
    flow_stats['population_per_sort'] = flow_stats['population_per'].sort_index()
    population_all = pd.concat(
        [flow_stats['population_freq_sort'], flow_stats['population_per_sort']],
        axis=1)
    population_all.columns = ['Count', 'Percentage']
    flow_stats['population_all'] = population_all

    flow_stats['min'] = flow_stats['data'].values.min()
    flow_stats['max'] = flow_stats['data'].values.max()
    flow_stats['markers'] = list(flow_stats['data'].columns)

    # Per-population aggregates, each also stored merged with the
    # count/percentage table under "<name>_pop".
    by_population = fcs.groupby('Population')
    aggregates = (
        ('mfi', by_population.mean()),
        ('gmfi', by_population.agg(lambda x: gmean(list(x)))),
        ('mdfi', by_population.median()),
    )
    for name, table in aggregates:
        table = table.round(decimals=2)
        flow_stats[name] = table
        merged = pd.merge(table, population_all,
                          left_index=True, right_index=True)
        merged['Population'] = merged.index
        flow_stats[name + '_pop'] = merged

    #
    # If the number of events is at most sample_size, then return
    # the complete data set,
    # Otherwise sample the data to only return about sample_size events.
    if events <= sample_size:
        flow_stats['sample'] = fcs
    else:
        # Plain dict: one cheap hash lookup per row instead of a pandas
        # label lookup.
        quotas = flow_stats['population_sample'].to_dict()
        taken = {}
        sample_rows = []
        # NOTE: rows come from fcs.values, so the sampled frame's dtypes may
        # differ from fcs when the columns have mixed dtypes.
        for row in fcs.values:
            label = row[columns - 1]
            count = taken.get(label, 0)
            # The first event of every population is always kept, even when
            # its rounded quota is 0; after that, keep events in file order
            # until the quota is reached.
            if count == 0 or count < quotas[label]:
                taken[label] = count + 1
                sample_rows.append(row)
        sample = pd.DataFrame(sample_rows)
        sample.columns = fcs.columns
        flow_stats['sample'] = sample

    flow_stats['sample_data'] = flow_stats['sample'].iloc[:, :-1]
    flow_stats['sample_population'] = flow_stats['sample'].iloc[:, -1]

    return flow_stats
65
66
if __name__ == '__main__':
    # Command-line entry point: compute the overview statistics for the
    # input table and dump a plain-text report of them to the output file.
    parser = ArgumentParser(
        prog="flowstats",
        description="Gets statistics on FLOCK run")

    parser.add_argument(
        '-i',
        dest="input_file",
        required=True,
        help="File locations for flow clr file.")

    parser.add_argument(
        '-o',
        dest="out_file",
        required=True,
        help="Path to the directory for the output file.")
    args = parser.parse_args()

    flow_stats = gen_overview_stats(args.input_file)

    # (label, value) pairs written one per line, in this order.
    report = [
        ("Events: ", flow_stats['events']),
        ("Min: ", flow_stats['min']),
        ("Max: ", flow_stats['max']),
        ("Columns: ", flow_stats['columns']),
        ("Markers: ", flow_stats['markers']),
        ("Population: ", flow_stats['population']),
        ("Population Freq: ", flow_stats['population_freq']),
        ("Population Sample: ", flow_stats['population_sample']),
        ("Population Per: ", flow_stats['population_per']),
    ]
    with open(args.out_file, "w") as outf:
        # file.write() takes exactly one string argument, so every line is
        # formatted first; passing the label and value as separate arguments
        # raises TypeError.
        for label, value in report:
            outf.write("{}{}\n".format(label, value))
        outf.write("Sample Data contains {} events\n".format(
            len(flow_stats['sample'])))
        outf.write("MIF_POP {}\n".format(flow_stats['mfi_pop']))