comparison formatter.py @ 4:966b4134ad12 draft

planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 5661cf2406e0616d7b2f4bee1b57ec43716088de
author recetox
date Tue, 18 Oct 2022 11:02:18 +0000
parents 574c6331e9db
children 1b09315a3f87
comparison
equal deleted inserted replaced
3:574c6331e9db 4:966b4134ad12
1 import click 1 import click
2 from pandas import DataFrame, read_csv, to_numeric 2 from matchms.importing import scores_from_json
3 from pandas import DataFrame
3 4
4 5
5 def create_long_table(data: DataFrame, value_id: str) -> DataFrame: 6 def create_long_table(data: DataFrame, value_id: str) -> DataFrame:
6 """Convert the table from compact into long format. 7 """Convert the table from compact into long format.
7 See DataFrame.melt(...). 8 See DataFrame.melt(...).
61 filtered = data[data['score'] > t_score] 62 filtered = data[data['score'] > t_score]
62 filtered = filtered[filtered['matches'] > t_matches] 63 filtered = filtered[filtered['matches'] > t_matches]
63 return filtered 64 return filtered
64 65
65 66
66 def load_data(scores_filename: str, matches_filename: str) -> DataFrame: 67 def scores_to_dataframes(scores):
68 """Unpack a matchms.Scores object into two dataframes: one of scores and one of matches.
69
70 Args:
71 scores (matchms.Scores): matchms.Scores object.
72
73 Returns:
74 DataFrame: Scores
75 DataFrame: Matches
76 """
77 query_names = [spectra.metadata['compound_name'] for spectra in scores.queries]
78 reference_names = [spectra.metadata['compound_name'] for spectra in scores.references]
79
80 dataframe_scores = DataFrame(data=[entry["score"] for entry in scores.scores], index=reference_names, columns=query_names)
81 dataframe_matches = DataFrame(data=[entry["matches"] for entry in scores.scores], index=reference_names, columns=query_names)
82
83 return dataframe_scores, dataframe_matches
84
85
86 def load_data(scores_filename: str) -> DataFrame:
67 """Load data from filenames and join on compound id. 87 """Load data from filenames and join on compound id.
68 88
69 Args: 89 Args:
70 scores_filename (str): Path to scores table. 90 scores_filename (str): Path to json file with serialized scores.
71 matches_filename (str): Path to matches table.
72 91
73 Returns: 92 Returns:
74 DataFrame: Joined dataframe on compounds containing scores an matches in long format. 93 DataFrame: Joined dataframe on compounds containing scores and matches in long format.
75 """ 94 """
76 matches = read_csv(matches_filename, sep="\t", index_col=0, header=0).apply(to_numeric) 95 scores = scores_from_json(scores_filename)
77 scores = read_csv(scores_filename, sep="\t", index_col=0, header=0).apply(to_numeric) 96 scores, matches = scores_to_dataframes(scores)
78 97
79 scores_long = create_long_table(scores, 'score') 98 scores_long = create_long_table(scores, 'score')
80 matches_long = create_long_table(matches, 'matches') 99 matches_long = create_long_table(matches, 'matches')
81 100
82 combined = join_df(matches_long, scores_long, on=['compound'], how='inner') 101 combined = join_df(matches_long, scores_long, on=['compound'], how='inner')
83 return combined 102 return combined
84 103
85 104
86 @click.group() 105 @click.group()
87 @click.option('--sf', 'scores_filename', type=click.Path(exists=True), required=True) 106 @click.option('--sf', 'scores_filename', type=click.Path(exists=True), required=True)
88 @click.option('--mf', 'matches_filename', type=click.Path(exists=True), required=True)
89 @click.option('--o', 'output_filename', type=click.Path(writable=True), required=True) 107 @click.option('--o', 'output_filename', type=click.Path(writable=True), required=True)
90 @click.pass_context 108 @click.pass_context
91 def cli(ctx, scores_filename, matches_filename, output_filename): 109 def cli(ctx, scores_filename, output_filename):
92 ctx.ensure_object(dict) 110 ctx.ensure_object(dict)
93 ctx.obj['data'] = load_data(scores_filename, matches_filename) 111 ctx.obj['data'] = load_data(scores_filename)
94 pass 112 pass
95 113
96 114
97 @cli.command() 115 @cli.command()
98 @click.option('--st', 'scores_threshold', type=float, required=True) 116 @click.option('--st', 'scores_threshold', type=float, required=True)
109 def get_top_k_data(ctx, k): 127 def get_top_k_data(ctx, k):
110 result = get_top_k_matches(ctx.obj['data'], k) 128 result = get_top_k_matches(ctx.obj['data'], k)
111 return result 129 return result
112 130
113 131
114 @cli.resultcallback() 132 @cli.result_callback()
115 def write_output(result: DataFrame, scores_filename, matches_filename, output_filename): 133 def write_output(result: DataFrame, scores_filename, output_filename):
116 result = result.reset_index().rename(columns={'level_0': 'query', 'compound': 'reference'}) 134 result = result.reset_index().rename(columns={'level_0': 'query', 'compound': 'reference'})
117 result.to_csv(output_filename, sep="\t", index=False) 135 result.to_csv(output_filename, sep="\t", index=False)
118 136
119 137
120 if __name__ == '__main__': 138 if __name__ == '__main__':