Mercurial > repos > recetox > matchms_formatter
diff formatter.py @ 4:966b4134ad12 draft
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 5661cf2406e0616d7b2f4bee1b57ec43716088de
author | recetox |
---|---|
date | Tue, 18 Oct 2022 11:02:18 +0000 |
parents | 574c6331e9db |
children | 1b09315a3f87 |
line wrap: on
line diff
```diff
--- a/formatter.py Wed Sep 21 15:29:51 2022 +0000
+++ b/formatter.py Tue Oct 18 11:02:18 2022 +0000
@@ -1,5 +1,6 @@
 import click
-from pandas import DataFrame, read_csv, to_numeric
+from matchms.importing import scores_from_json
+from pandas import DataFrame
 
 
 def create_long_table(data: DataFrame, value_id: str) -> DataFrame:
@@ -63,18 +64,36 @@
     return filtered
 
 
-def load_data(scores_filename: str, matches_filename: str) -> DataFrame:
+def scores_to_dataframes(scores):
+    """Unpack scores from matchms.scores into two dataframes of scores and matches.
+
+    Args:
+        scores (matchms.scores): matchms.scores object.
+
+    Returns:
+        DataFrame: Scores
+        DataFrame: Matches
+    """
+    query_names = [spectra.metadata['compound_name'] for spectra in scores.queries]
+    reference_names = [spectra.metadata['compound_name'] for spectra in scores.references]
+
+    dataframe_scores = DataFrame(data=[entry["score"] for entry in scores.scores], index=reference_names, columns=query_names)
+    dataframe_matches = DataFrame(data=[entry["matches"] for entry in scores.scores], index=reference_names, columns=query_names)
+
+    return dataframe_scores, dataframe_matches
+
+
+def load_data(scores_filename: str) -> DataFrame:
     """Load data from filenames and join on compound id.
 
     Args:
-        scores_filename (str): Path to scores table.
-        matches_filename (str): Path to matches table.
+        scores_filename (str): Path to json file with serialized scores.
 
     Returns:
-        DataFrame: Joined dataframe on compounds containing scores an matches in long format.
+        DataFrame: Joined dataframe on compounds containing scores and matches in long format.
     """
-    matches = read_csv(matches_filename, sep="\t", index_col=0, header=0).apply(to_numeric)
-    scores = read_csv(scores_filename, sep="\t", index_col=0, header=0).apply(to_numeric)
+    scores = scores_from_json(scores_filename)
+    scores, matches = scores_to_dataframes(scores)
 
     scores_long = create_long_table(scores, 'score')
     matches_long = create_long_table(matches, 'matches')
@@ -85,12 +104,11 @@
 
 @click.group()
 @click.option('--sf', 'scores_filename', type=click.Path(exists=True), required=True)
-@click.option('--mf', 'matches_filename', type=click.Path(exists=True), required=True)
 @click.option('--o', 'output_filename', type=click.Path(writable=True), required=True)
 @click.pass_context
-def cli(ctx, scores_filename, matches_filename, output_filename):
+def cli(ctx, scores_filename, output_filename):
     ctx.ensure_object(dict)
-    ctx.obj['data'] = load_data(scores_filename, matches_filename)
+    ctx.obj['data'] = load_data(scores_filename)
     pass
 
 
@@ -111,8 +129,8 @@
     return result
 
 
-@cli.resultcallback()
-def write_output(result: DataFrame, scores_filename, matches_filename, output_filename):
+@cli.result_callback()
+def write_output(result: DataFrame, scores_filename, output_filename):
     result = result.reset_index().rename(columns={'level_0': 'query', 'compound': 'reference'})
     result.to_csv(output_filename, sep="\t", index=False)
 
```