Mercurial > repos > recetox > matchms_formatter
comparison formatter.py @ 10:1b09315a3f87 draft
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
author | recetox |
---|---|
date | Tue, 27 Jun 2023 14:25:59 +0000 |
parents | 966b4134ad12 |
children | ae45992f969e |
comparison
equal
deleted
inserted
replaced
9:715fe77be601 | 10:1b09315a3f87 |
---|---|
1 import click | 1 import click |
2 from matchms.importing import scores_from_json | 2 from matchms.importing import scores_from_json |
3 from pandas import DataFrame | 3 from pandas import DataFrame |
4 | 4 |
5 | 5 |
6 def create_long_table(data: DataFrame, value_id: str) -> DataFrame: | 6 def scores_to_dataframe(scores): |
7 """Convert the table from compact into long format. | |
8 See DataFrame.melt(...). | |
9 | |
10 Args: | |
11 data (DataFrame): The data table to convert. | |
12 value_id (str): The name to assign to the added column through conversion to long format. | |
13 | |
14 Returns: | |
15 DataFrame: Table in long format. | |
16 """ | |
17 return data.transpose().melt(ignore_index=False, var_name='compound', value_name=value_id) | |
18 | |
19 | |
20 def join_df(x: DataFrame, y: DataFrame, on=[], how="inner") -> DataFrame: | |
21 """Shortcut function to join two dataframes on columns and index | |
22 | |
23 Args: | |
24 x (DataFrame): Table X | |
25 y (DataFrame): Table Y | |
26 on (list, optional): Columns on which to join. Defaults to []. | |
27 how (str, optional): Join method, see DataFrame.join(...). Defaults to "inner". | |
28 | |
29 Returns: | |
30 DataFrame: Joined dataframe. | |
31 """ | |
32 df_x = x.set_index([x.index] + on) | |
33 df_y = y.set_index([y.index] + on) | |
34 combined = df_x.join(df_y, how=how) | |
35 return combined | |
36 | |
37 | |
38 def get_top_k_matches(data: DataFrame, k: int) -> DataFrame: | |
39 """Function to get top k matches from dataframe with scores. | |
40 | |
41 Args: | |
42 data (DataFrame): A table with score column. | |
43 k (int): Number of top scores to retrieve. | |
44 | |
45 Returns: | |
46 DataFrame: Table containing only the top k best matches for each compound. | |
47 """ | |
48 return data.groupby(level=0, group_keys=False).apply(DataFrame.nlargest, n=k, columns=['score']) | |
49 | |
50 | |
51 def filter_thresholds(data: DataFrame, t_score: float, t_matches: float) -> DataFrame: | |
52 """Filter a dataframe with scores and matches to only contain values above specified thresholds. | |
53 | |
54 Args: | |
55 data (DataFrame): Table to filter. | |
56 t_score (float): Score threshold. | |
57 t_matches (float): Matches threshold. | |
58 | |
59 Returns: | |
60 DataFrame: Filtered dataframe. | |
61 """ | |
62 filtered = data[data['score'] > t_score] | |
63 filtered = filtered[filtered['matches'] > t_matches] | |
64 return filtered | |
65 | |
66 | |
67 def scores_to_dataframes(scores): | |
68 """Unpack scores from matchms.scores into two dataframes of scores and matches. | 7 """Unpack scores from matchms.scores into two dataframes of scores and matches. |
69 | 8 |
70 Args: | 9 Args: |
71 scores (matchms.scores): matchms.scores object. | 10 scores (matchms.scores): matchms.scores object. |
72 | 11 |
73 Returns: | 12 Returns: |
74 DataFrame: Scores | 13 DataFrame: Scores |
75 DataFrame: Matches | 14 DataFrame: Matches |
76 """ | 15 """ |
77 query_names = [spectra.metadata['compound_name'] for spectra in scores.queries] | 16 dataframe = DataFrame(columns=['query', 'reference', *scores.scores.score_names]) |
78 reference_names = [spectra.metadata['compound_name'] for spectra in scores.references] | |
79 | 17 |
80 dataframe_scores = DataFrame(data=[entry["score"] for entry in scores.scores], index=reference_names, columns=query_names) | 18 for i, (row, col) in enumerate(zip(scores.scores.row, scores.scores.col)): |
81 dataframe_matches = DataFrame(data=[entry["matches"] for entry in scores.scores], index=reference_names, columns=query_names) | 19 dataframe.loc[i] = [scores.queries[col].metadata['compound_name'], scores.references[row].metadata['compound_name'], *scores.scores.data[i]] |
82 | 20 |
83 return dataframe_scores, dataframe_matches | 21 return dataframe |
84 | 22 |
85 | 23 |
86 def load_data(scores_filename: str) -> DataFrame: | 24 def load_data(scores_filename: str) -> DataFrame: |
87 """Load data from filenames and join on compound id. | 25 """Load data from filenames and join on compound id. |
88 | 26 |
91 | 29 |
92 Returns: | 30 Returns: |
93 DataFrame: Joined dataframe on compounds containing scores and matches in long format. | 31 DataFrame: Joined dataframe on compounds containing scores and matches in long format. |
94 """ | 32 """ |
95 scores = scores_from_json(scores_filename) | 33 scores = scores_from_json(scores_filename) |
96 scores, matches = scores_to_dataframes(scores) | 34 scores = scores_to_dataframe(scores) |
97 | 35 |
98 scores_long = create_long_table(scores, 'score') | 36 return scores |
99 matches_long = create_long_table(matches, 'matches') | |
100 | |
101 combined = join_df(matches_long, scores_long, on=['compound'], how='inner') | |
102 return combined | |
103 | 37 |
104 | 38 |
105 @click.group() | 39 @click.group(invoke_without_command=True) |
106 @click.option('--sf', 'scores_filename', type=click.Path(exists=True), required=True) | 40 @click.option('--sf', 'scores_filename', type=click.Path(exists=True), required=True) |
107 @click.option('--o', 'output_filename', type=click.Path(writable=True), required=True) | 41 @click.option('--o', 'output_filename', type=click.Path(writable=True), required=True) |
108 @click.pass_context | 42 def cli(scores_filename, output_filename): |
109 def cli(ctx, scores_filename, output_filename): | 43 result = load_data(scores_filename) |
110 ctx.ensure_object(dict) | 44 result.to_csv(output_filename, sep="\t", index=False) |
111 ctx.obj['data'] = load_data(scores_filename) | |
112 pass | 45 pass |
113 | 46 |
114 | 47 |
115 @cli.command() | |
116 @click.option('--st', 'scores_threshold', type=float, required=True) | |
117 @click.option('--mt', 'matches_threshold', type=float, required=True) | |
118 @click.pass_context | |
119 def get_thresholded_data(ctx, scores_threshold, matches_threshold): | |
120 result = filter_thresholds(ctx.obj['data'], scores_threshold, matches_threshold) | |
121 return result | |
122 | |
123 | |
124 @cli.command() | |
125 @click.option('--k', 'k', type=int, required=True) | |
126 @click.pass_context | |
127 def get_top_k_data(ctx, k): | |
128 result = get_top_k_matches(ctx.obj['data'], k) | |
129 return result | |
130 | |
131 | |
132 @cli.result_callback() | |
133 def write_output(result: DataFrame, scores_filename, output_filename): | |
134 result = result.reset_index().rename(columns={'level_0': 'query', 'compound': 'reference'}) | |
135 result.to_csv(output_filename, sep="\t", index=False) | |
136 | |
137 | |
138 if __name__ == '__main__': | 48 if __name__ == '__main__': |
139 cli(obj={}) | 49 cli() |