Mercurial > repos > recetox > matchms_similarity
annotate matchms_split.py @ 1:872d8040f713 draft default tip
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit b1cc1aebf796f170d93e3dd46ffcdefdc7b8018a
author | recetox |
---|---|
date | Thu, 12 Oct 2023 13:25:30 +0000 |
parents | e5010b19d64d |
children |
rev | line source |
---|---|
0
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
1 import argparse |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
2 import itertools |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
3 import os |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
4 from typing import List |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
5 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
6 from matchms.exporting import save_as_msp |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
7 from matchms.importing import load_from_msp |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
8 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
9 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
10 def get_spectra_names(spectra: list) -> List[str]: |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
11 """Read the keyword 'compound_name' from a spectra. |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
12 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
13 Args: |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
14 spectra (list): List of individual spectra. |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
15 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
16 Returns: |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
17 List[str]: List with 'compoud_name' of individual spectra. |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
18 """ |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
19 return [x.get("compound_name") for x in spectra] |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
20 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
21 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
22 def make_outdir(outdir: str): |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
23 """Create destination directory. |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
24 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
25 Args: |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
26 outdir (str): Path to destination directory where split spectra files are generated. |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
27 """ |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
28 return os.mkdir(outdir) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
29 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
30 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
31 def write_spectra(spectra, outdir): |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
32 """Generates MSP files of individual spectra. |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
33 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
34 Args: |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
35 spectra (List[Spectrum]): Spectra to write to file |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
36 outdir (str): Path to destination directory. |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
37 """ |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
38 names = get_spectra_names(spectra) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
39 for i in range(len(spectra)): |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
40 outpath = assemble_outpath(names[i], outdir) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
41 save_as_msp(spectra[i], outpath) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
42 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
43 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
44 def assemble_outpath(name, outdir): |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
45 """Filter special chracteres from name. |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
46 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
47 Args: |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
48 name (str): Name to be filetered. |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
49 outdir (str): Path to destination directory. |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
50 """ |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
51 filename = ''.join(filter(str.isalnum, name)) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
52 outfile = str(filename) + ".msp" |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
53 outpath = os.path.join(outdir, outfile) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
54 return outpath |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
55 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
56 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
57 def split_round_robin(iterable, num_chunks): |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
58 chunks = [list() for _ in range(num_chunks)] |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
59 index = itertools.cycle(range(num_chunks)) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
60 for value in iterable: |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
61 chunks[next(index)].append(value) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
62 chunks = filter(lambda x: len(x) > 0, chunks) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
63 return chunks |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
64 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
65 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
66 listarg = argparse.ArgumentParser() |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
67 listarg.add_argument('--filename', type=str) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
68 listarg.add_argument('--method', type=str) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
69 listarg.add_argument('--outdir', type=str) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
70 listarg.add_argument('--parameter', type=int) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
71 args = listarg.parse_args() |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
72 outdir = args.outdir |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
73 filename = args.filename |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
74 method = args.method |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
75 parameter = args.parameter |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
76 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
77 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
78 if __name__ == "__main__": |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
79 spectra = load_from_msp(filename, metadata_harmonization=True) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
80 make_outdir(outdir) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
81 |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
82 if method == "one-per-file": |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
83 write_spectra(list(spectra), outdir) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
84 else: |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
85 if method == "chunk-size": |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
86 chunks = iter(lambda: list(itertools.islice(spectra, parameter)), []) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
87 elif method == "num-chunks": |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
88 chunks = split_round_robin(spectra, parameter) |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
89 for i, x in enumerate(chunks): |
e5010b19d64d
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit f79a5b51599254817727bc9028b9797ea994cb4e
recetox
parents:
diff
changeset
|
90 save_as_msp(x, os.path.join(outdir, f"chunk_{i}.msp")) |