Mercurial > repos > jay > gaiac_regression_plot
annotate gaiac_outlier_removal/gaiac_outlier_removal.py @ 3:2ae74925a4fe draft default tip
planemo upload for repository https://github.com/jaidevjoshi83/gaiac commit e9587f93346c7b55e1be00bad5844bf2db3ed03d-dirty
author | jay |
---|---|
date | Thu, 10 Jul 2025 19:40:59 +0000 |
parents | 0a8233db930e |
children |
rev | line source |
---|---|
0
0a8233db930e
planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff
changeset
|
1 import numpy as np |
0a8233db930e
planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff
changeset
|
2 import pandas as pd |
0a8233db930e
planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff
changeset
|
3 |
0a8233db930e
planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff
changeset
|
4 # Example: python gaiac_outlier_removal.py -I input.tsv -C 2,3 -O output.tsv -M replace -QU 75 -QL 25 -MU 1.5 |
0a8233db930e
planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff
changeset
|
5 |
0a8233db930e
planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff
changeset
|
6 |
0a8233db930e
planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff
changeset
|
def AddMedian(input_data, column_list, out_file, method='drop', Q_UP=75, Q_DOWN=25, Multiplier=1.5, sep='\t'):
    """Drop or median-replace IQR outliers in selected columns of a delimited file.

    For every selected column the lower and upper percentiles are computed,
    their difference (the inter-quantile range) is scaled by ``Multiplier``,
    and values outside ``[lower - k * range, upper + k * range]`` are treated
    as outliers. Missing values are ignored when computing the percentiles
    and the median, so a single empty cell no longer turns the bounds into
    NaN.

    Args:
        input_data: Path (or file-like object) of the delimited input table
            with a header row.
        column_list: Comma-separated string of 1-based column numbers to
            process, e.g. ``"2,3"``.
        out_file: Path (or file-like object) the result is written to. The
            output is always tab-separated and carries no index column.
        method: ``'drop'`` removes every row holding an outlier in any of the
            selected columns (columns are filtered one after another, so
            later columns are evaluated on the already reduced table; rows
            whose value in a selected column is missing are removed as well,
            because they cannot satisfy the bounds). ``'replace'`` keeps all
            rows and overwrites outliers with the column median.
        Q_UP: Upper percentile in the range 0-100 (number or numeric string).
        Q_DOWN: Lower percentile in the range 0-100 (number or numeric string).
        Multiplier: Factor applied to the inter-quantile range (number or
            numeric string).
        sep: Field separator of the input file.

    Raises:
        ValueError: If ``method`` is neither ``'drop'`` nor ``'replace'``, or
            a column number / numeric option cannot be parsed.
        IndexError: If a column number is smaller than 1 or larger than the
            number of columns in the input.
    """
    if method not in ('drop', 'replace'):
        raise ValueError("Invalid method. Choose 'drop' or 'replace'.")

    # pandas expects quantiles as fractions in [0, 1], the interface uses 0-100.
    upper_q = float(Q_UP) / 100.0
    lower_q = float(Q_DOWN) / 100.0
    multiplier = float(Multiplier)

    df = pd.read_csv(input_data, sep=sep)
    headers = df.columns.tolist()

    selected = []
    for token in column_list.split(','):
        position = int(token)
        # Reject 0 and negatives explicitly: plain list indexing would wrap
        # around and silently pick a column counted from the end.
        if position < 1 or position > len(headers):
            raise IndexError(
                "Column number {} is out of range; expected a value between 1 and {}.".format(
                    position, len(headers)
                )
            )
        selected.append(headers[position - 1])

    # Keep the historical interpolation of each mode so results with the
    # default percentiles stay unchanged.
    interpolation = 'midpoint' if method == 'drop' else 'linear'

    for col in selected:
        values = df[col]
        # Series.quantile skips missing values and accepts the same
        # interpolation names across pandas versions.
        q_high = values.quantile(upper_q, interpolation=interpolation)
        q_low = values.quantile(lower_q, interpolation=interpolation)
        spread = q_high - q_low
        upper_bound = q_high + (multiplier * spread)
        lower_bound = q_low - (multiplier * spread)

        if method == 'replace':
            outliers = (values < lower_bound) | (values > upper_bound)
            if outliers.any():
                # Build a new column instead of assigning in place so that an
                # integer column can be upcast when the median is fractional.
                df[col] = values.mask(outliers, values.median())
        else:
            # Comparisons with a missing value are False, so such rows go too.
            df = df[(values >= lower_bound) & (values <= upper_bound)]

    df.to_csv(out_file, sep="\t", index=None)
0a8233db930e
planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff
changeset
|
45 |
0a8233db930e
planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff
changeset
|
46 |
0a8233db930e
planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
jay
parents:
diff
changeset
|
if __name__ == "__main__":
    import argparse

    # Command-line interface: one (flags, keyword options) entry per option,
    # listed in the order they should appear in the --help output.
    option_specs = [
        (("-I", "--infile"), {"required": True, "help": "Path to input TSV file"}),
        (("-C", "--column_list"), {"required": True, "help": "Comma-separated list of 1-based column numbers to process"}),
        (("-O", "--outfile"), {"required": True, "help": "Output TSV file path"}),
        (("-M", "--method"), {"required": True, "choices": ["drop", "replace"], "help": "Select whether to 'drop' outliers or 'replace' with median"}),
        (("-QU", "--upper_quartile"), {"default": 75, "help": "Upper quartile value (default 75)"}),
        (("-QL", "--lower_quartile"), {"default": 25, "help": "Lower quartile value (default 25)"}),
        (("-MU", "--multiplier_constant"), {"default": 1.5, "help": "IQR multiplier constant (default 1.5)"}),
        (("-S", "--sep"), {"default": '\t', "help": "deliminator"}),
    ]

    cli = argparse.ArgumentParser(description="Outlier removal or replacement tool")
    for flags, options in option_specs:
        cli.add_argument(*flags, **options)
    opts = cli.parse_args()

    # Numeric options are passed through as parsed; AddMedian converts them.
    AddMedian(
        opts.infile,
        opts.column_list,
        opts.outfile,
        opts.method,
        opts.upper_quartile,
        opts.lower_quartile,
        opts.multiplier_constant,
        opts.sep,
    )