comparison target_screen.py @ 1:6d51be3d7bb5 draft default tip
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/misc commit d6102c60e41d91adf1c7a876f84ef420a69262e2
| author | recetox |
|---|---|
| date | Mon, 12 May 2025 14:05:37 +0000 |
| parents | d4c2d5bc0524 |
| children | |
| 0:d4c2d5bc0524 | 1:6d51be3d7bb5 |
|---|---|
| 1 import argparse | 1 import argparse |
| | 2 from typing import Tuple |
| 2 | 3 |
| 3 import numpy as np | 4 import numpy as np |
| 4 import pandas as pd | 5 import pandas as pd |
| 5 | 6 |
| 6 | 7 |
| 7 def mz_match(marker, peak, ppm): | 8 class LoadDataAction(argparse.Action): |
| 9 """ | |
| 10 Custom argparse action to load data from a file into a pandas DataFrame. | |
| 11 Supports CSV, TSV, and Parquet file formats. | |
| 12 """ | |
| 13 def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace, values: Tuple[str, str], option_string: str = None) -> None: | |
| 14 file_path, file_extension = values | |
| 15 file_extension = file_extension.lower() | |
| 16 if file_extension == "csv": | |
| 17 df = pd.read_csv(file_path) | |
| 18 elif file_extension in ["tsv", "tabular"]: | |
| 19 df = pd.read_csv(file_path, sep="\t") | |
| 20 elif file_extension == "parquet": | |
| 21 df = pd.read_parquet(file_path) | |
| 22 else: | |
| 23 raise ValueError(f"Unsupported file format: {file_extension}") | |
| 24 setattr(namespace, self.dest, df) | |
| 25 | |
| 26 | |
| 27 def mz_match(marker: np.ndarray, peak: np.ndarray, ppm: int) -> np.ndarray: | |
| 28 """ | |
| 29 Check if the mass-to-charge ratio (m/z) of markers and peaks match within a given PPM tolerance. | |
| 30 | |
| 31 Args: | |
| 32 marker (np.ndarray): Array of marker m/z values. | |
| 33 peak (np.ndarray): Array of peak m/z values. | |
| 34 ppm (int): PPM tolerance for matching. | |
| 35 | |
| 36 Returns: | |
| 37 np.ndarray: Boolean array indicating matches. | |
| 38 """ | |
| 8 return np.abs(marker - peak) <= ((peak + marker) / 2) * ppm * 1e-06 | 39 return np.abs(marker - peak) <= ((peak + marker) / 2) * ppm * 1e-06 |
| 9 | 40 |
| 10 | 41 |
| 11 def rt_match(marker, peak, tol): | 42 def rt_match(marker: np.ndarray, peak: np.ndarray, tol: int) -> np.ndarray: |
| 43 """ | |
| 44 Check if the retention time (rt) of markers and peaks match within a given tolerance. | |
| 45 | |
| 46 Args: | |
| 47 marker (np.ndarray): Array of marker retention times. | |
| 48 peak (np.ndarray): Array of peak retention times. | |
| 49 tol (int): Retention time tolerance for matching. | |
| 50 | |
| 51 Returns: | |
| 52 np.ndarray: Boolean array indicating matches. | |
| 53 """ | |
| 12 return np.abs(marker - peak) <= tol | 54 return np.abs(marker - peak) <= tol |
| 13 | 55 |
| 14 | 56 |
| 15 def find_matches(peaks, markers, ppm, rt_tol): | 57 def find_matches(peaks: pd.DataFrame, markers: pd.DataFrame, ppm: int, rt_tol: int) -> pd.DataFrame: |
| 58 """ | |
| 59 Find matches between peaks and markers based on m/z and retention time tolerances. | |
| 60 | |
| 61 Args: | |
| 62 peaks (pd.DataFrame): DataFrame containing peak data with 'mz' and 'rt' columns. | |
| 63 markers (pd.DataFrame): DataFrame containing marker data with 'mz' and 'rt' columns. | |
| 64 ppm (int): PPM tolerance for m/z matching. | |
| 65 rt_tol (int): Retention time tolerance for rt matching. | |
| 66 | |
| 67 Returns: | |
| 68 pd.DataFrame: DataFrame containing matched rows with all columns from peaks and markers. | |
| 69 """ | |
| 16 # Create a meshgrid of all combinations of mz and rt values | 70 # Create a meshgrid of all combinations of mz and rt values |
| 17 marker_mz = markers['mz'].values[:, np.newaxis] | 71 marker_mz = markers['mz'].values[:, np.newaxis] |
| 18 peak_mz = peaks['mz'].values | 72 peak_mz = peaks['mz'].values |
| 19 marker_rt = markers['rt'].values[:, np.newaxis] | 73 marker_rt = markers['rt'].values[:, np.newaxis] |
| 20 peak_rt = peaks['rt'].values | 74 peak_rt = peaks['rt'].values |
| 27 match_indices = np.where(mz_matches & rt_matches) | 81 match_indices = np.where(mz_matches & rt_matches) |
| 28 | 82 |
| 29 # Create a DataFrame of hits | 83 # Create a DataFrame of hits |
| 30 matched_markers = markers.iloc[match_indices[0]].reset_index(drop=True) | 84 matched_markers = markers.iloc[match_indices[0]].reset_index(drop=True) |
| 31 matched_peaks = peaks.iloc[match_indices[1]].reset_index(drop=True) | 85 matched_peaks = peaks.iloc[match_indices[1]].reset_index(drop=True) |
| 32 hits = pd.concat([matched_markers[['formula']].reset_index(drop=True), matched_peaks], axis=1) | |
| 33 | 86 |
| 34 # Calculate mz and rt differences | 87 # Calculate mz and rt differences |
| 35 hits['mz_diff'] = np.abs(matched_markers['mz'].values - matched_peaks['mz'].values) | 88 matched_markers['mz_diff'] = np.abs(matched_markers['mz'].values - matched_peaks['mz'].values) |
| 36 hits['rt_diff'] = np.abs(matched_markers['rt'].values - matched_peaks['rt'].values) | 89 matched_markers['rt_diff'] = np.abs(matched_markers['rt'].values - matched_peaks['rt'].values) |
| 37 | 90 |
| | 91 # Drop mz and rt columns from the marker table |
| | 92 matched_markers = matched_markers.drop(columns=['mz', 'rt']) |
| | 93 |
| | 94 # Combine all columns from peaks and markers |
| | 95 hits = pd.concat([matched_markers.reset_index(drop=True), matched_peaks.reset_index(drop=True)], axis=1) |
| 38 return hits | 96 return hits |
| 39 | 97 |
| 40 | 98 |
| 41 def main(): | 99 def main() -> None: |
| 100 """ | |
| 101 Main function to parse arguments, find matches between peaks and markers, and save the results. | |
| 102 """ | |
| 42 parser = argparse.ArgumentParser(description='Find matches between peaks and markers.') | 103 parser = argparse.ArgumentParser(description='Find matches between peaks and markers.') |
| 43 parser.add_argument('--peaks', required=True, help='Path to the peaks parquet file.') | 104 parser.add_argument('--peaks', required=True, nargs=2, action=LoadDataAction, help='Path to the peaks file and its format (e.g., "file.parquet parquet").') |
| 44 parser.add_argument('--markers', required=True, help='Path to the markers CSV file.') | 105 parser.add_argument('--markers', required=True, nargs=2, action=LoadDataAction, help='Path to the markers file and its format (e.g., "file.tsv tsv").') |
| 45 parser.add_argument('--output', required=True, help='Path to the output TSV file.') | 106 parser.add_argument('--output', required=True, help='Path to the output TSV file.') |
| 46 parser.add_argument('--ppm', type=int, default=5, help='PPM tolerance for mz matching.') | 107 parser.add_argument('--ppm', type=int, default=5, help='PPM tolerance for mz matching.') |
| 47 parser.add_argument('--rt_tol', type=int, default=10, help='RT tolerance for rt matching.') | 108 parser.add_argument('--rt_tol', type=int, default=10, help='RT tolerance for rt matching.') |
| 48 args = parser.parse_args() | 109 args = parser.parse_args() |
| 49 | 110 |
| 50 peaks = pd.read_parquet(args.peaks) | 111 hits = find_matches(args.peaks, args.markers, args.ppm, args.rt_tol) |
| 51 markers = pd.read_csv(args.markers, sep='\t') | |
| 52 | |
| 53 hits = find_matches(peaks, markers, args.ppm, args.rt_tol) | |
| 54 | 112 |
| 55 hits.to_csv(args.output, sep='\t', index=False) | 113 hits.to_csv(args.output, sep='\t', index=False) |
| 56 | 114 |
| 57 | 115 |
| 58 if __name__ == "__main__": | 116 if __name__ == "__main__": |
| 59 main() | 117 main() |
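The sketches below illustrate the pieces this revision changes. First, the new LoadDataAction turns each file option into a two-token argument (path plus format) and stores the loaded DataFrame directly on the argparse namespace. A minimal sketch of that behaviour, assuming target_screen.py can be imported from the working directory; the file name markers_demo.csv is made up:

```python
import argparse

import pandas as pd

# Assumption: target_screen.py sits in the working directory so the class is importable.
from target_screen import LoadDataAction

# Write a tiny markers table so the action has something to load (hypothetical file name).
pd.DataFrame({"formula": ["C6H12O6"], "mz": [180.0634], "rt": [120.0]}).to_csv("markers_demo.csv", index=False)

parser = argparse.ArgumentParser()
# Two tokens per option: the file path and its format, matching the revised tool CLI.
parser.add_argument("--markers", required=True, nargs=2, action=LoadDataAction)
args = parser.parse_args(["--markers", "markers_demo.csv", "csv"])

print(type(args.markers))             # <class 'pandas.core.frame.DataFrame'>
print(args.markers.columns.tolist())  # ['formula', 'mz', 'rt']
```

The same mechanism serves --peaks; tsv, tabular, and parquet inputs are handled by the other branches of the action.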
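mz_match expresses the m/z tolerance in parts per million of the mean of the two masses, so the absolute window grows with mass. A worked example with illustrative numbers:

```python
import numpy as np

# Illustrative values only: a marker at m/z 200.0000 and a peak at 200.0008, screened at 5 ppm.
marker = np.array([200.0000])
peak = np.array([200.0008])
ppm = 5

# The window is ppm * 1e-6 times the mean of the two masses:
# ((200.0008 + 200.0000) / 2) * 5e-6 ~= 0.001
tol = ((peak + marker) / 2) * ppm * 1e-06
print(tol)                           # [0.001000002]
print(np.abs(marker - peak) <= tol)  # [ True] -> a 0.0008 difference passes at 5 ppm
```

At m/z around 1000 the same 5 ppm setting would allow roughly 0.005, while rt_match applies a flat absolute tolerance.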
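find_matches compares every marker against every peak at once by broadcasting a column vector of marker values against a row vector of peak values and then reading off the indices where both boolean matrices are true. The standalone sketch below reproduces that broadcasting pattern on toy data; the tolerance checks are inlined rather than calling the script's helpers, and all values are invented:

```python
import numpy as np
import pandas as pd

# Toy tables with the column names the script expects ('mz', 'rt', plus anything else).
markers = pd.DataFrame({"formula": ["C6H12O6", "C9H8O4"], "mz": [180.0634, 180.0423], "rt": [120.0, 305.0]})
peaks = pd.DataFrame({"mz": [180.0630, 250.1000], "rt": [118.5, 300.0], "intensity": [1.2e5, 3.4e4]})
ppm, rt_tol = 5, 10

# Column vectors (n_markers, 1) broadcast against row vectors (n_peaks,) -> (n_markers, n_peaks) matrices.
marker_mz = markers["mz"].values[:, np.newaxis]
marker_rt = markers["rt"].values[:, np.newaxis]
peak_mz = peaks["mz"].values
peak_rt = peaks["rt"].values

mz_matches = np.abs(marker_mz - peak_mz) <= ((peak_mz + marker_mz) / 2) * ppm * 1e-06
rt_matches = np.abs(marker_rt - peak_rt) <= rt_tol

# Row indices pick markers, column indices pick peaks, for every cell where both criteria hold.
marker_idx, peak_idx = np.where(mz_matches & rt_matches)
print(markers.iloc[marker_idx]["formula"].values)  # ['C6H12O6']
print(peaks.iloc[peak_idx]["mz"].values)           # [180.063]
```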
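Putting it together, the revised CLI takes two tokens per file option. A hypothetical end-to-end run, assuming target_screen.py is in the current directory and pandas/numpy are installed; CSV is used for both inputs so no parquet engine is needed, and every file name is made up:

```python
import subprocess
import sys

import pandas as pd

# Hypothetical toy inputs for a dry run.
pd.DataFrame({"mz": [180.0630], "rt": [118.5], "intensity": [1.2e5]}).to_csv("peaks_demo.csv", index=False)
pd.DataFrame({"formula": ["C6H12O6"], "mz": [180.0634], "rt": [120.0]}).to_csv("markers_demo.csv", index=False)

# Each file option takes two tokens: the path and its format.
subprocess.run(
    [sys.executable, "target_screen.py",
     "--peaks", "peaks_demo.csv", "csv",
     "--markers", "markers_demo.csv", "csv",
     "--output", "hits_demo.tsv",
     "--ppm", "5", "--rt_tol", "10"],
    check=True,
)

hits = pd.read_csv("hits_demo.tsv", sep="\t")
# Based on the revised find_matches: marker columns without mz/rt, then mz_diff/rt_diff, then all peak columns.
print(hits.columns.tolist())  # expected: ['formula', 'mz_diff', 'rt_diff', 'mz', 'rt', 'intensity']
```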