annotate matchms_split.py @ 0:ea891750acfc draft

planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
author recetox
date Mon, 04 Dec 2023 19:17:25 +0000
parents
children 358a151ab81e
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
1 import argparse
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
2 import itertools
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
3 import os
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
4 from typing import List
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
5
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
6 from matchms.exporting import save_as_msp
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
7 from matchms.importing import load_from_msp
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
8
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
9
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
def get_spectra_names(spectra: list) -> List[str]:
    """Collect the 'compound_name' entry of every spectrum.

    Args:
        spectra (list): List of individual spectra; each item must provide a
            ``get`` method for metadata lookup.

    Returns:
        List[str]: One entry per spectrum, in input order, holding the value
            returned by ``get("compound_name")`` for that spectrum.
    """
    key = "compound_name"
    return [spectrum.get(key) for spectrum in spectra]
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
20
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
21
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
def make_outdir(outdir: str):
    """Create the destination directory for the split spectra files.

    Missing parent directories are created as well, and an already existing
    directory is accepted, so the call is safe to repeat.

    Args:
        outdir (str): Path to destination directory where split spectra files are generated.

    Raises:
        FileExistsError: If ``outdir`` exists but is not a directory.
    """
    # os.mkdir raises on an existing directory and on missing parents;
    # makedirs with exist_ok=True tolerates both situations.
    os.makedirs(outdir, exist_ok=True)
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
29
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
30
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
def write_spectra(spectra, outdir):
    """Write every spectrum to an MSP file inside ``outdir``.

    The file path of each spectrum is derived from its 'compound_name' via
    ``assemble_outpath``.

    Args:
        spectra (List[Spectrum]): Spectra to write to file.
        outdir (str): Path to destination directory.
    """
    # Materialise once so the input can be walked twice (names + spectra),
    # even when a one-shot iterator is passed in.
    spectra = list(spectra)
    names = get_spectra_names(spectra)
    # Walk names and spectra in lockstep instead of indexing both lists.
    for name, spectrum in zip(names, spectra):
        save_as_msp(spectrum, assemble_outpath(name, outdir))
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
42
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
43
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
def assemble_outpath(name, outdir):
    """Build the output file path for a spectrum name.

    Every character of ``name`` that is not alphanumeric is dropped, the
    '.msp' extension is appended and the result is joined onto ``outdir``.

    Args:
        name (str): Name to be filtered.
        outdir (str): Path to destination directory.

    Returns:
        str: Path of the MSP file inside ``outdir``.
    """
    stem = ''.join(char for char in name if char.isalnum())
    return os.path.join(outdir, f"{stem}.msp")
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
55
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
56
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
def split_round_robin(iterable, num_chunks):
    """Distribute the items of an iterable over chunks in round-robin order.

    Item ``k`` of ``iterable`` is placed into chunk ``k % num_chunks``. Chunks
    that stay empty (fewer items than chunks) are left out of the result.

    Args:
        iterable (Iterable): Items to distribute; consumed exactly once.
        num_chunks (int): Maximum number of chunks to create, at least 1.

    Returns:
        Iterator[list]: Lazy iterator over the non-empty chunks.

    Raises:
        ValueError: If ``num_chunks`` is smaller than 1.
    """
    if num_chunks < 1:
        # Reject non-positive counts up front with a clear message.
        raise ValueError(f"num_chunks must be at least 1, got {num_chunks}")
    chunks = [[] for _ in range(num_chunks)]
    for position, value in enumerate(iterable):
        chunks[position % num_chunks].append(value)
    # filter(None, ...) keeps truthy items only, i.e. the non-empty lists.
    return filter(None, chunks)
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
64
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
65
ea891750acfc planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit 970c7dc210faacd545c740ddae0e5e78c2cecce4
recetox
parents:
diff changeset
# Command line interface: input MSP file, split method, destination
# directory and the integer parameter used by the chunking methods.
listarg = argparse.ArgumentParser()
listarg.add_argument('--filename', type=str)
listarg.add_argument('--method', type=str)
listarg.add_argument('--outdir', type=str)
listarg.add_argument('--parameter', type=int)
args = listarg.parse_args()
outdir = args.outdir
filename = args.filename
method = args.method
parameter = args.parameter


if __name__ == "__main__":
    # Fail early with a usage message instead of a NameError on 'chunks'
    # further down when the method is missing or misspelled.
    if method not in ("one-per-file", "chunk-size", "num-chunks"):
        listarg.error(f"unsupported --method: {method!r}")

    spectra = load_from_msp(filename, metadata_harmonization=True)
    make_outdir(outdir)

    if method == "one-per-file":
        # One MSP file per spectrum, named after its compound name.
        write_spectra(list(spectra), outdir)
    else:
        if method == "chunk-size":
            # Two-argument iter(): keep slicing 'parameter' spectra off the
            # stream until an empty list (the sentinel) comes back.
            # NOTE(review): relies on 'spectra' being a one-shot iterator.
            chunks = iter(lambda: list(itertools.islice(spectra, parameter)), [])
        else:
            # "num-chunks": spread the spectra over 'parameter' chunks.
            chunks = split_round_robin(spectra, parameter)
        for i, x in enumerate(chunks):
            save_as_msp(x, os.path.join(outdir, f"chunk_{i}.msp"))