comparison matchms_similarity_wrapper.py @ 0:30e680e555d4 draft

"planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 4d2ac914c951166e386a94d8ebb8cb1becfac122"
author recetox
date Tue, 22 Mar 2022 16:07:32 +0000
parents
children f680068b7863
comparison
equal deleted inserted replaced
-1:000000000000 0:30e680e555d4
1 import argparse
2 import sys
3
4 import numpy as np
5 from matchms import calculate_scores
6 from matchms.importing import load_from_mgf, load_from_msp
7 from matchms.similarity import CosineGreedy, CosineHungarian, MetadataMatch, ModifiedCosine
8 from pandas import DataFrame
9
10
def convert_precursor_mz(spectrum):
    """
    Make sure the spectrum carries a float precursor m/z, which the ModifiedCosine similarity metric relies on.

    The "precursor_mz" metadata entry is cast with float() and the metadata is assigned back to the spectrum.
    Returns the same spectrum object. Raises ValueError when the entry is absent.
    """
    # Guard clause: nothing to convert when the key is absent.
    if "precursor_mz" not in spectrum.metadata:
        raise ValueError("Precursor_mz missing. Apply 'add_precursor_mz' filter first.")

    updated = spectrum.metadata
    updated["precursor_mz"] = float(updated["precursor_mz"])
    # Assign back explicitly instead of relying on in-place mutation of whatever `metadata` returned.
    spectrum.metadata = updated
    return spectrum
24
25
def _load_spectra(filename, file_format, role):
    """Read all spectra from `filename` ('msp' or 'mgf') into a list; `role` only names the input in the error."""
    if file_format == 'msp':
        return list(load_from_msp(filename))
    if file_format == 'mgf':
        return list(load_from_mgf(filename))
    raise ValueError(f'File format {file_format} not supported for {role}.')


def main(argv):
    """
    Compute pairwise similarity scores between query and reference spectra and store them as .tsv tables.

    Parameters
    ----------
    argv
        Command line arguments without the program name. Passing None makes argparse fall back to sys.argv[1:].

    Returns
    -------
    int
        0 on success, -1 if the requested similarity metric is not supported.

    Raises
    ------
    ValueError
        If a spectra file format is neither 'msp' nor 'mgf', or (ModifiedCosine only) a spectrum lacks
        the precursor m/z.
    """
    parser = argparse.ArgumentParser(description="Compute MSP similarity scores")
    parser.add_argument("-r", dest="ri_tolerance", type=float, help="Use RI filtering with given tolerance.")
    parser.add_argument("-s", dest="symmetric", action='store_true', help="Computation is symmetric.")
    parser.add_argument("--ref", dest="references_filename", type=str, help="Path to reference spectra library.")
    parser.add_argument("--ref_format", dest="references_format", type=str, help="Reference spectra library file format.")
    parser.add_argument("queries_filename", type=str, help="Path to query spectra.")
    parser.add_argument("queries_format", type=str, help="Query spectra file format.")
    parser.add_argument("similarity_metric", type=str, help='Metric to use for matching.')
    parser.add_argument("tolerance", type=float, help="Tolerance to use for peak matching.")
    parser.add_argument("mz_power", type=float, help="The power to raise mz to in the cosine function.")
    parser.add_argument("intensity_power", type=float, help="The power to raise intensity to in the cosine function.")
    parser.add_argument("output_filename_scores", type=str, help="Path where to store the output .tsv scores.")
    parser.add_argument("output_filename_matches", type=str, help="Path where to store the output .tsv matches.")
    # Parse the arguments handed to this function rather than silently re-reading sys.argv.
    args = parser.parse_args(argv)

    queries_spectra = _load_spectra(args.queries_filename, args.queries_format, 'query spectra')

    if args.symmetric:
        # No separate library is read in symmetric mode; the queries are scored against themselves below.
        reference_spectra = []
    else:
        reference_spectra = _load_spectra(args.references_filename, args.references_format, 'reference spectra library')

    if args.similarity_metric == 'CosineGreedy':
        similarity_metric = CosineGreedy(args.tolerance, args.mz_power, args.intensity_power)
    elif args.similarity_metric == 'CosineHungarian':
        similarity_metric = CosineHungarian(args.tolerance, args.mz_power, args.intensity_power)
    elif args.similarity_metric == 'ModifiedCosine':
        similarity_metric = ModifiedCosine(args.tolerance, args.mz_power, args.intensity_power)
        # ModifiedCosine needs a float precursor m/z on every spectrum.
        reference_spectra = list(map(convert_precursor_mz, reference_spectra))
        queries_spectra = list(map(convert_precursor_mz, queries_spectra))
    else:
        # Keep the -1 status for callers, but say why instead of failing silently.
        print(f"Similarity metric {args.similarity_metric} not supported.", file=sys.stderr)
        return -1

    # Single source of truth for "what are we comparing against", shared by scoring and RI filtering.
    references = queries_spectra if args.symmetric else reference_spectra

    print("Calculating scores...")
    scores = calculate_scores(
        references=references,
        queries=queries_spectra,
        similarity_function=similarity_metric,
        is_symmetric=args.symmetric
    )

    if args.ri_tolerance is not None:
        print("RI filtering with tolerance ", args.ri_tolerance)
        # Must use the same references as above: in symmetric mode `reference_spectra` is empty, so
        # matching against it would not line up with the score matrix.
        ri_matches = calculate_scores(references, queries_spectra, MetadataMatch("retention_index", "difference", args.ri_tolerance)).scores
        # Zero the score of every pair whose retention index match is falsy.
        scores.scores["score"] = np.where(ri_matches, scores.scores["score"], 0.0)

    write_outputs(args, scores)
    return 0
85
86
def write_outputs(args, scores):
    """
    Store the computed scores and the numbers of matches as two tab-separated tables.

    Rows are labelled with the 'compound_name' of the reference spectra, columns with the 'compound_name'
    of the query spectra. The "score" field goes to args.output_filename_scores and the "matches" field
    to args.output_filename_matches.
    """
    print("Storing outputs...")
    column_labels = [query.metadata['compound_name'] for query in scores.queries]
    row_labels = [reference.metadata['compound_name'] for reference in scores.references]

    # One table per field of the score entries: scores first, then number of matches.
    targets = (
        ("score", args.output_filename_scores),
        ("matches", args.output_filename_matches),
    )
    for field, destination in targets:
        rows = [entry[field] for entry in scores.scores]
        table = DataFrame(data=rows, index=row_labels, columns=column_labels)
        table.to_csv(destination, sep='\t')
99
100
if __name__ == "__main__":
    # Hand main()'s status to the interpreter so that a failure (-1) yields a non-zero exit code
    # instead of being discarded.
    sys.exit(main(argv=sys.argv[1:]))