annotate gaiac_outlier_removal/gaiac_outlier_removal.py @ 2:738f9ecd4dd1 draft

planemo upload for repository https://github.com/jaidevjoshi83/gaiac commit c31ef5726cd1d2659da9aeb956c22fb834b177ff
author jay
date Fri, 23 May 2025 17:54:48 +0000
parents 1d05627d399f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
1 import numpy as np
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
2 import pandas as pd
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
3
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
4 # python 'outlier_removal.py' -I '' -M 'replace' -QU '75' -QL '25' -MU '1.5'
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
5
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
6
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
7 def AddMedian(input_data, column_list, out_file, method='drop', Q_UP=75, Q_DOWN=25, Multiplier=1.5, sep='\t'):
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
8 df = pd.read_csv(input_data, sep=sep)
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
9 cl = df.columns.tolist()
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
10
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
11 clms = [cl[int(x)-1] for x in column_list.split(',')]
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
12
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
13 Q_UP = float(Q_UP)
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
14 Q_DOWN = float(Q_DOWN)
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
15 Multiplier = float(Multiplier)
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
16
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
17 if method == 'replace':
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
18 for col in clms:
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
19 q75, q25 = np.percentile(df[col], [Q_UP, Q_DOWN])
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
20 intr_qr = q75 - q25
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
21 upper_bound = q75 + (Multiplier * intr_qr)
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
22 lower_bound = q25 - (Multiplier * intr_qr)
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
23
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
24 median_val = np.median(df[col])
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
25 df.loc[df[col] < lower_bound, col] = median_val
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
26 df.loc[df[col] > upper_bound, col] = median_val
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
27
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
28 elif method == "drop":
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
29 # compute bounds for each column
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
30 for col in clms:
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
31 Q1 = np.percentile(df[col], 25, interpolation='midpoint')
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
32 Q3 = np.percentile(df[col], 75, interpolation='midpoint')
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
33 IQR = Q3 - Q1
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
34
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
35 upper_bound = Q3 + (Multiplier * IQR)
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
36 lower_bound = Q1 - (Multiplier * IQR)
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
37
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
38 # drop rows where col value is an outlier
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
39 df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
40
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
41 else:
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
42 raise ValueError("Invalid method. Choose 'drop' or 'replace'.")
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
43
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
44 df.to_csv(out_file, sep="\t", index=None)
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
45
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
46
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
47 if __name__ == "__main__":
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
48 import argparse
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
49
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
50 parser = argparse.ArgumentParser(description="Outlier removal or replacement tool")
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
51
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
52 parser.add_argument("-I", "--infile", required=True, help="Path to input TSV file")
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
53 parser.add_argument("-C", "--column_list", required=True, help="Comma-separated list of 1-based column numbers to process")
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
54 parser.add_argument("-O", "--outfile", required=True, help="Output TSV file path")
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
55 parser.add_argument("-M", "--method", required=True, choices=["drop", "replace"], help="Select whether to 'drop' outliers or 'replace' with median")
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
56 parser.add_argument("-QU", "--upper_quartile", default=75, help="Upper quartile value (default 75)")
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
57 parser.add_argument("-QL", "--lower_quartile", default=25, help="Lower quartile value (default 25)")
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
58 parser.add_argument("-MU", "--multiplier_constant", default=1.5, help="IQR multiplier constant (default 1.5)")
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
59 parser.add_argument("-S", "--sep", default='\t', help="deliminator")
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
60
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
61 args = parser.parse_args()
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
62
1d05627d399f planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff changeset
63 AddMedian(args.infile, args.column_list, args.outfile, args.method, args.upper_quartile, args.lower_quartile, args.multiplier_constant, args.sep)