Mercurial > repos > jay > gaiac_regression_plot
comparison gaiac_outlier_removal/gaiac_outlier_removal.py @ 0:0a8233db930e draft
planemo upload for repository https://github.com/jaidevjoshi83/gaiac.git commit c29a769ed165f313a6410925be24f776652a9663-dirty
| author | jay |
|---|---|
| date | Thu, 15 May 2025 14:46:28 +0000 |
| parents | |
| children |
comparison
equal
deleted
inserted
replaced
| -1:000000000000 | 0:0a8233db930e |
|---|---|
| 1 import numpy as np | |
| 2 import pandas as pd | |
| 3 | |
| 4 # Example: python gaiac_outlier_removal.py -I input.tsv -C 2,3 -O output.tsv -M replace -QU 75 -QL 25 -MU 1.5 | |
| 5 | |
| 6 | |
def _percentile(values, q, interpolation='linear'):
    """Return the q-th percentile(s) of *values* on both old and new NumPy.

    NumPy 1.22 renamed the ``interpolation`` keyword of ``np.percentile`` to
    ``method`` and deprecated the old spelling, so try the new one first and
    fall back for older installations.
    """
    try:
        return np.percentile(values, q, method=interpolation)
    except TypeError:
        return np.percentile(values, q, interpolation=interpolation)


def AddMedian(input_data, column_list, out_file, method='drop', Q_UP=75, Q_DOWN=25, Multiplier=1.5, sep='\t'):
    """Drop or median-replace inter-quantile-range outliers in selected columns.

    For every selected column the bounds are
    ``P(Q_DOWN) - Multiplier * R`` and ``P(Q_UP) + Multiplier * R`` where
    ``R = P(Q_UP) - P(Q_DOWN)``.  Values strictly outside the bounds are
    outliers.  Missing values are ignored when computing the statistics and
    are never treated as outliers.

    Columns are processed in the order given; in 'drop' mode the bounds of a
    later column are therefore computed from the rows that survived the
    earlier columns.

    Args:
        input_data: Path or file-like object read with ``pandas.read_csv``.
        column_list: Comma-separated 1-based column numbers, e.g. ``"2,3"``.
        out_file: Path or file-like object; always written tab-separated.
        method: ``'drop'`` removes rows containing an outlier, ``'replace'``
            overwrites each outlier with the column median.
        Q_UP: Upper percentile (0-100); anything ``float()`` accepts.
        Q_DOWN: Lower percentile (0-100); anything ``float()`` accepts.
        Multiplier: Factor applied to the inter-quantile range.
        sep: Field separator of the input file.

    Raises:
        ValueError: Unknown ``method``, non-numeric arguments, or
            ``Q_DOWN`` greater than ``Q_UP``.
        IndexError: A column number outside ``1..number_of_columns``.
    """
    if method not in ('drop', 'replace'):
        raise ValueError("Invalid method. Choose 'drop' or 'replace'.")

    Q_UP = float(Q_UP)
    Q_DOWN = float(Q_DOWN)
    Multiplier = float(Multiplier)
    if Q_DOWN > Q_UP:
        raise ValueError(
            f"Lower quartile ({Q_DOWN}) must not exceed upper quartile ({Q_UP})."
        )

    df = pd.read_csv(input_data, sep=sep)
    header = df.columns.tolist()

    selected = []
    for token in column_list.split(','):
        position = int(token)
        # Reject 0 and negatives explicitly: plain list indexing would wrap
        # around and silently pick a column counted from the end.
        if not 1 <= position <= len(header):
            raise IndexError(
                f"Column number {position} is out of range; expected 1..{len(header)}."
            )
        selected.append(header[position - 1])

    # 'drop' has always used midpoint interpolation and 'replace' linear;
    # both are kept so results with default parameters do not change.
    interpolation = 'midpoint' if method == 'drop' else 'linear'

    for col in selected:
        valid = df[col].dropna().to_numpy()
        if valid.size == 0:
            # Nothing to measure (empty table or all-missing column).
            continue

        upper_q, lower_q = _percentile(valid, [Q_UP, Q_DOWN], interpolation)
        spread = upper_q - lower_q
        upper_bound = upper_q + (Multiplier * spread)
        lower_bound = lower_q - (Multiplier * spread)

        # Comparisons with NaN are False, so missing values are not flagged.
        outliers = (df[col] < lower_bound) | (df[col] > upper_bound)

        if method == 'drop':
            df = df[~outliers]
        elif outliers.any():
            median_val = np.median(valid)
            # A fractional median cannot live in an integer column; widen the
            # column explicitly instead of relying on implicit upcasting on
            # assignment, which recent pandas versions deprecate.
            if pd.api.types.is_integer_dtype(df[col].dtype) and not float(median_val).is_integer():
                df[col] = df[col].astype(float)
            df.loc[outliers, col] = median_val

    df.to_csv(out_file, sep="\t", index=False)
| 45 | |
| 46 | |
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the outlier filter.
    import argparse

    parser = argparse.ArgumentParser(description="Outlier removal or replacement tool")

    parser.add_argument("-I", "--infile", required=True, help="Path to input TSV file")
    parser.add_argument("-C", "--column_list", required=True, help="Comma-separated list of 1-based column numbers to process")
    parser.add_argument("-O", "--outfile", required=True, help="Output TSV file path")
    parser.add_argument("-M", "--method", required=True, choices=["drop", "replace"], help="Select whether to 'drop' outliers or 'replace' with median")
    # type=float lets argparse reject malformed numbers with a usage message
    # instead of a traceback raised later inside AddMedian.
    parser.add_argument("-QU", "--upper_quartile", type=float, default=75, help="Upper quartile value (default 75)")
    parser.add_argument("-QL", "--lower_quartile", type=float, default=25, help="Lower quartile value (default 25)")
    parser.add_argument("-MU", "--multiplier_constant", type=float, default=1.5, help="IQR multiplier constant (default 1.5)")
    parser.add_argument("-S", "--sep", default='\t', help="Field delimiter of the input file (default: tab)")

    args = parser.parse_args()

    AddMedian(
        args.infile,
        args.column_list,
        args.outfile,
        args.method,
        args.upper_quartile,
        args.lower_quartile,
        args.multiplier_constant,
        args.sep,
    )
