Repository 'matchms_similarity'
hg clone https://toolshed.g2.bx.psu.edu/repos/recetox/matchms_similarity

Changeset 1:872d8040f713 (2023-10-12)
Previous changeset 0:e5010b19d64d (2023-06-27)
Commit message:
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/matchms commit b1cc1aebf796f170d93e3dd46ffcdefdc7b8018a
modified:
formatter.py
macros.xml
matchms_filtering_wrapper.py
matchms_similarity.xml
test-data/similarity/scores_test6_out.json
test-data/spectral_similarity/test1.json
added:
test-data/convert/ms_lima_output.msp
test-data/filtering/reduce_to_top_n_peaks.msp
test-data/filtering/require_filter.msp
test-data/filtering/require_inchi_out.msp
test-data/filtering/require_smiles_out.msp
test-data/out_matchms_add_key.msp
b
diff -r e5010b19d64d -r 872d8040f713 formatter.py
--- a/formatter.py Tue Jun 27 14:26:29 2023 +0000
+++ b/formatter.py Thu Oct 12 13:25:30 2023 +0000
[
@@ -13,10 +13,12 @@
         DataFrame: Scores
         DataFrame: Matches
     """
-    dataframe = DataFrame(columns=['query', 'reference', *scores.scores.score_names])
+    data = []
 
     for i, (row, col) in enumerate(zip(scores.scores.row, scores.scores.col)):
-        dataframe.loc[i] = [scores.queries[col].metadata['compound_name'], scores.references[row].metadata['compound_name'], *scores.scores.data[i]]
+        data.append([scores.queries[col].metadata['compound_name'], scores.references[row].metadata['compound_name'], *scores.scores.data[i]])
+
+    dataframe = DataFrame(data, columns=['query', 'reference', *scores.scores.score_names])
 
     return dataframe
 
b
diff -r e5010b19d64d -r 872d8040f713 macros.xml
--- a/macros.xml Tue Jun 27 14:26:29 2023 +0000
+++ b/macros.xml Thu Oct 12 13:25:30 2023 +0000
b
@@ -1,5 +1,5 @@
 <macros>
-    <token name="@TOOL_VERSION@">0.20.0</token>
+    <token name="@TOOL_VERSION@">0.22.0</token>
 
     <xml name="creator">
         <creator>
@@ -57,9 +57,11 @@
 
     <xml name="input_param">
         <conditional name="scores">
-            <param name="use_scores" label="Use Scores Object" type="boolean" truevalue="TRUE" falsevalue="FALSE"
-                   checked="false"/>
-            <when value="TRUE">
+            <param name="use_scores" label="Use Scores Object" type="select">
+                <option value="False" selected="true">FALSE</option>
+                <option value="True">TRUE</option>
+            </param>
+            <when value="True">
                 <param label="Scores object" name="scores_in" type="data" format="json"
                     help="Scores objects calculated previously using one of the matchms similarity tools." />
                 <param label="join type" name="join_type" type="select" display="radio"
@@ -70,7 +72,7 @@
                     <option value="outer">outer</option>
                 </param>
             </when>
-            <when value="FALSE">
+            <when value="False">
                 <param label="Queries spectra" name="queries" type="data" format="msp"
                     help="Query mass spectra to match against references."/>
                 <param label="Reference spectra" name="references" type="data" format="msp"
@@ -89,7 +91,7 @@
 <token name="@init_scores@">
 from matchms.importing import load_from_msp, scores_from_json
 from matchms import Scores
-#if $scores.use_scores
+#if $scores.use_scores == "True"
 scores = scores_from_json("$scores_in")
 join_type = "$scores.join_type"
 #else
b
diff -r e5010b19d64d -r 872d8040f713 matchms_filtering_wrapper.py
--- a/matchms_filtering_wrapper.py Tue Jun 27 14:26:29 2023 +0000
+++ b/matchms_filtering_wrapper.py Thu Oct 12 13:25:30 2023 +0000
b
@@ -4,10 +4,18 @@
 from matchms.exporting import save_as_mgf, save_as_msp
 from matchms.filtering import add_compound_name, add_fingerprint, add_losses, add_parent_mass, add_precursor_mz,\
     add_retention_index, add_retention_time, clean_compound_name
-from matchms.filtering import default_filters, normalize_intensities, select_by_mz, select_by_relative_intensity
+from matchms.filtering import default_filters, normalize_intensities, reduce_to_number_of_peaks, select_by_mz, \
+    select_by_relative_intensity
 from matchms.importing import load_from_mgf, load_from_msp
 
 
+def require_key(spectrum, key):
+    if spectrum.get(key):
+        return spectrum
+
+    return None
+
+
 def main(argv):
     parser = argparse.ArgumentParser(description="Compute MSP similarity scores")
     parser.add_argument("--spectra", type=str, required=True, help="Mass spectra file to be filtered.")
@@ -27,13 +35,23 @@
                         help="Keep only peaks between set m/z range (keep if to_mz >= m/z >= from_mz).")
     parser.add_argument("--from_mz", type=float, help="Lower bound for m/z  filter")
     parser.add_argument("--to_mz", type=float, help="Upper bound for m/z  filter")
+    parser.add_argument("-require_smiles", action='store_true',
+                        help="Remove spectra that does not contain SMILES.")
+    parser.add_argument("-require_inchi", action='store_true',
+                        help="Remove spectra that does not contain INCHI.")
+    parser.add_argument("-reduce_to_top_n_peaks", action='store_true',
+                        help="reduce to top n peaks filter.")
+    parser.add_argument("--n_max", type=int, help="Maximum number of peaks. Remove peaks if more peaks are found.")
     args = parser.parse_args()
 
     if not (args.normalise_intensities
             or args.default_filters
             or args.clean_metadata
             or args.relative_intensity
-            or args.mz_range):
+            or args.mz_range
+            or args.require_smiles
+            or args.require_inchi
+            or args.reduce_to_top_n_peaks):
         raise ValueError('No filter selected.')
 
     if args.spectra_format == 'msp':
@@ -63,7 +81,17 @@
         if args.mz_range:
             spectrum = select_by_mz(spectrum, args.from_mz, args.to_mz)
 
-        filtered_spectra.append(spectrum)
+        if args.reduce_to_top_n_peaks:
+            spectrum = reduce_to_number_of_peaks(spectrum_in=spectrum, n_max=args.n_max)
+
+        if args.require_smiles and spectrum is not None:
+            spectrum = require_key(spectrum, "smiles")
+
+        if args.require_inchi and spectrum is not None:
+            spectrum = require_key(spectrum, "inchi")
+
+        if spectrum is not None:
+            filtered_spectra.append(spectrum)
 
     if args.spectra_format == 'msp':
         save_as_msp(filtered_spectra, args.output)
b
diff -r e5010b19d64d -r 872d8040f713 matchms_similarity.xml
--- a/matchms_similarity.xml Tue Jun 27 14:26:29 2023 +0000
+++ b/matchms_similarity.xml Thu Oct 12 13:25:30 2023 +0000
[
@@ -1,4 +1,4 @@
-<tool id="matchms_similarity" name="matchms similarity" version="@TOOL_VERSION@+galaxy0" profile="21.09">
+<tool id="matchms_similarity" name="matchms similarity" version="@TOOL_VERSION@+galaxy2" profile="21.09">
     <description>calculate the similarity score and matched peaks</description>
 
     <macros>
@@ -13,21 +13,21 @@
         <requirement type="package" version="@TOOL_VERSION@">matchms</requirement>
     </requirements>
 
-    <environment_variables>
-        <environment_variable name="MPLCONFIGDIR">\$_GALAXY_JOB_TMP_DIR</environment_variable>
-    </environment_variables>
-
     <command detect_errors="exit_code"><![CDATA[
         sh ${matchms_python_cli}
     ]]> </command>
 
+    <environment_variables>
+        <environment_variable name="MPLCONFIGDIR">\$_GALAXY_JOB_TMP_DIR</environment_variable>
+    </environment_variables>
+
     <configfiles>
         <configfile name="matchms_python_cli">
             python3 ${__tool_directory__}/matchms_similarity_wrapper.py \
-            #if $ri_filtering.is_true
+            #if $ri_filtering.is_true == "TRUE"
             -r $ri_filtering.tolerance \
             #end if
-            #if $symmetric.is_symmetric
+            #if $symmetric.is_symmetric == "TRUE"
             -s \
             #else
             --ref "$references" \
@@ -58,12 +58,15 @@
         <param label="Queries spectra" name="queries" type="data" format="msp,mgf"
                help="Query mass spectra to match against references."/>
         <conditional name="symmetric">
-            <param name="is_symmetric" label="Symmetric" type="boolean" truevalue="TRUE" falsevalue="FALSE"
-                   checked="false"/>
+            <param name="is_symmetric" label="Symmetric" type="select">
+                <option value="FALSE" selected="true">FALSE</option>
+                <option value="TRUE">TRUE</option>
+            </param>
             <when value="FALSE">
                 <param label="Reference spectra" name="references" type="data" format="msp,mgf"
                        help="Reference mass spectra to match against as library."/>
             </when>
+            <when value="TRUE"></when>
         </conditional>
         <param label="Scores array type" name="array_type" type="select" display="radio"
                help="Matrix type for storing scores objects. Sparse type more memory-efficient and better for large arrays.
@@ -107,12 +110,15 @@
 
 
         <conditional name="ri_filtering">
-            <param name="is_true" label="Apply RI filtering" type="boolean" truevalue="TRUE" falsevalue="FALSE"
-                   checked="false"/>
+            <param name="is_true" label="Apply RI filtering" type="select">
+                <option value="FALSE" selected="true">FALSE</option>
+                <option value="TRUE">TRUE</option>
+            </param>
             <when value="TRUE">
                 <param label="tolerance" name="tolerance" type="float" value="60"
                        help="Peaks will be considered a match when less than tolerance apart."/>
             </when>
+            <when value="FALSE"></when>
         </conditional>
     </inputs>
 
@@ -141,7 +147,7 @@
             <param name="references" value="similarity/fill.mgf" ftype="mgf"/>
             <param name="queries" value="similarity/fill2.msp" ftype="msp"/>
             <conditional name="ri_filtering">
-                <param name="is_true" value="True"></param>
+                <param name="is_true" value="TRUE"></param>
                 <param name="tolerance" value="60.0" />
             </conditional>
             <conditional name="metric">
@@ -164,7 +170,7 @@
             </conditional>
             <param name="is_symmetric" value="TRUE" />
             <conditional name="ri_filtering">
-                <param name="is_true" value="True"></param>
+                <param name="is_true" value="TRUE"></param>
                 <param name="tolerance" value="60.0" />
             </conditional>
             <output name="similarity_scores" file="similarity/scores_test5_out.json" ftype="json"/>
@@ -178,7 +184,7 @@
                 <param name="model_weights" value="similarity/spec2vec/weights_100.binary" ftype="auto"/>
                 <param name="allow_missing_percentage" value="1.0"/>
             </conditional>
-            <output name="similarity_scores" file="similarity/scores_test6_out.json" ftype="json" compare="sim_size" delta="100000"/>
+            <output name="similarity_scores" file="similarity/scores_test6_out.json" ftype="json" compare="sim_size" delta="1000"/>
         </test>
     </tests>
 
b
diff -r e5010b19d64d -r 872d8040f713 test-data/convert/ms_lima_output.msp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/convert/ms_lima_output.msp Thu Oct 12 13:25:30 2023 +0000
[
b'@@ -0,0 +1,4848 @@\n+FORMULA: C4H10NO3PS\n+INCHIKEY: YASYVMFAVPKPKE-SECBINFHSA-N\n+SMILES: COP(=O)(N=C(O)C)SC\n+NAME: Acephate\n+RETENTIONTIME: 1.232997\n+PRECURSORMZ: 184.0194\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 16\n+90.09368 1128.0\n+93.11512 1241.0\n+95.10279 1118.0\n+101.31465 1152.0\n+102.90688 1322.0\n+103.98039 1201.0\n+112.01607 12289.0\n+112.99994 38027.0\n+115.00399 1634.0\n+124.98121 922.0\n+128.97701 9208.0\n+132.57193 1350.0\n+135.84808 1428.0\n+142.99275 16419.0\n+147.94205 1750.0\n+173.5094 2353.0\n+\n+FORMULA: C12H11NO2\n+INCHIKEY: CVXBEEMKQHEXEN-UHFFFAOYSA-N\n+SMILES: CN=C(Oc1cccc2c1cccc2)O\n+NAME: Carbaryl\n+RETENTIONTIME: 5.259445\n+PRECURSORMZ: 202.0863\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 1\n+145.06491 1326147.0\n+\n+FORMULA: C8H16NO5P\n+INCHIKEY: VEENJGZXVHKXNB-UHFFFAOYSA-N\n+SMILES: COP(=O)(OC(=CC(=O)N(C)C)C)OC\n+NAME: Dicrotophos\n+RETENTIONTIME: 2.025499\n+PRECURSORMZ: 238.0844\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 5\n+112.074 102027.0\n+112.07591 9070987.0\n+127.01563 3230337.0\n+193.02605 7897744.0\n+238.08437 2973124.0\n+\n+FORMULA: C5H12NO3PS2\n+INCHIKEY: MCWXGJITAZMZEV-UHFFFAOYSA-N\n+SMILES: CN=C(CSP(=S)(OC)OC)O\n+NAME: Dimethoate\n+RETENTIONTIME: 2.866696\n+PRECURSORMZ: 230.0072\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 8\n+88.0219 548446.0\n+124.98233 183861.0\n+142.99275 722053.0\n+156.95422 80792.0\n+170.97 1426256.0\n+197.98123 240915.0\n+198.96501 5415933.0\n+230.00722 497851.0\n+\n+FORMULA: C21H22NO4Cl\n+INCHIKEY: QNBTYORWCCMPQP-UHFFFAOYSA-N\n+SMILES: COc1cc(ccc1OC)C(=CC(=O)N1CCOCC1)c1ccc(cc1)Cl\n+NAME: Dimethomorph\n+RETENTIONTIME: 7.060486\n+PRECURSORMZ: 388.1316\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 22\n+114.05532 468862.0\n+125.01571 886745.0\n+138.99484 4138370.0\n+155.0705 425164.0\n+165.05519 15513399.0\n+165.06543 350695.0\n+195.08057 386226.0\n+215.0262 490061.0\n+223.07544 702025.0\n+227.02576 230514.0\n+229.04225 216308.0\n+235.07555 241142.0\n+238.09914 1323577.0\n+242.04929 2449236.0\n+243.02142 891584.0\n+257.03726 578874.0\n+258.04443 3232295.0\n+266.0943 358273.0\n+270.04492 608851.0\n+273.06772 3866006.0\n+286.03912 483547.0\n+301.06311 4060551.0\n+\n+FORMULA: C2H8NO2PS\n+INCHIKEY: NNKVPIKMPCQWCG-ZCFIWIBFSA-N\n+SMILES: COP(=O)(SC)N\n+NAME: Methamidophos\n+RETENTIONTIME: 1.153307\n+PRECURSORMZ: 142.0089\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 4\n+98.00042 37721.0\n+109.98272 71172.0\n+112.01607 2867923.0\n+127.99321 75837.0\n+\n+FORMULA: C7H13O6P\n+INCHIKEY: GEPDYQSQVLXLEU-UHFFFAOYSA-N\n+SMILES: COC(=O)C=C(OP(=O)(OC)OC)C\n+NAME: Mevinphos\n+RETENTIONTIME: 2.876307\n+PRECURSORMZ: 225.0525\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 4\n+99.04416 295529.0\n+127.01563 1960973.0\n+193.02605 1150190.0\n+225.05209 101872.0\n+\n+FORMULA: C5H12NO4PS\n+INCHIKEY: PZXOQEXFMJCDPG-UHFFFAOYSA-N\n+SMILES: CN=C(CSP(=O)(OC)OC)O\n+NAME: Omethoate\n+RETENTIONTIME: 1.33423\n+PRECURSORMZ: 214.0303\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 5\n+104.01654 86844.0\n+124.98233 194375.0\n+127.01563 4696021.0\n+128.97701 47970.0\n+142.99275 4310988.0\n+\n+FORMULA: C16H20O6P2S3\n+INCHIKEY: WWJZWCUNLNYYAU-UHFFFAOYSA-N\n+SMILES: COP(=S)(Oc1ccc(cc1)Sc1ccc(cc1)OP(=S)(OC)OC)OC\n+NAME: Temephos\n+RETENTIONTIME: 7.736881\n+PRECURSORMZ: 466.9978\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 44\n+124.98233 218400.0\n+125.00596 124192.0\n+127.01563 590561.0\n+139.02167 79978.0\n+139.05467 105470.0\n+140.95975 428071.0\n+142.99275 7482486.0\n+154.99849 619650.0\n+157.00861 365474.0\n+171.02641 502869.0\n+172.03448 151150.0\n+183.02695 176056.0\n+184.03453 206568.0\n+187.02121 240339.0\n+199.02151 245544.0\n+200.02902 385101.0\n+201.03729 198527.0\n+211.03268 88063.0\n+215.01689 538632.0\n+217.03214 259530.0\n+218.98798 87371.0\n+219.02972 94609.0\n+230.99336 108101.0\n+232.03233 244260.0\n+233.00958 88058.0\n+247.02538 224924.0\n+248.03291 127038.0\n+261.98486 132283.0\n+262.99268 185876.0\n+264.00052 186556.0\n+278.98856 20'..b'1 1076938.0\n+214.04874 274804.0\n+218.05339 138241.0\n+223.04858 173264.0\n+225.06474 1428863.0\n+238.05968 4320120.0\n+239.08025 155000.0\n+247.08057 319312.0\n+253.14586 318558.0\n+255.08643 272181.0\n+267.08603 1563035.0\n+270.13541 250539.0\n+281.12677 392614.0\n+295.14307 440522.0\n+298.08517 500719.0\n+298.15424 170519.0\n+299.09323 317366.0\n+307.14276 192450.0\n+323.14941 13337730.0\n+328.07541 186287.0\n+334.15344 701456.0\n+348.08228 404641.0\n+353.0658 262110.0\n+366.07346 193709.0\n+368.08932 5815862.0\n+495.20059 2433116.0\n+\n+FORMULA: C11H16N2O2\n+INCHIKEY: IMIDOCRTMDIQIJ-UHFFFAOYSA-N\n+SMILES: CN=C(Oc1ccc(c(c1)C)N(C)C)O\n+NAME: Aminocarb_1\n+RETENTIONTIME: 0.8035756\n+PRECURSORMZ: 209.129\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 5\n+120.05733 176701.0\n+122.06016 1917070.0\n+136.07611 928093.0\n+137.08363 8823033.0\n+152.10725 186336.0\n+\n+FORMULA: C11H16N2O2\n+INCHIKEY: IMIDOCRTMDIQIJ-UHFFFAOYSA-N\n+SMILES: CN=C(Oc1ccc(c(c1)C)N(C)C)O\n+NAME: Aminocarb_2\n+RETENTIONTIME: 1.13997\n+PRECURSORMZ: 209.129\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 5\n+120.05733 247123.0\n+122.06016 2666029.0\n+136.07611 1253139.0\n+137.08363 12201258.0\n+152.10725 242082.0\n+\n+FORMULA: C9H20N2O2\n+INCHIKEY: WZZLDXDUQPOXNW-UHFFFAOYSA-N\n+SMILES: CCCOC(=NCCCN(C)C)O\n+NAME: Propamocarb_1\n+RETENTIONTIME: 0.7535679\n+PRECURSORMZ: 189.1603\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 2\n+86.0966 201548.0\n+102.05516 5038638.0\n+\n+FORMULA: C9H20N2O2\n+INCHIKEY: WZZLDXDUQPOXNW-UHFFFAOYSA-N\n+SMILES: CCCOC(=NCCCN(C)C)O\n+NAME: Propamocarb_2\n+RETENTIONTIME: 1.081971\n+PRECURSORMZ: 189.1603\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 2\n+86.0966 107829.0\n+102.05516 2507023.0\n+\n+FORMULA: C11H15N3O2\n+INCHIKEY: MYPKGPZHHQEODQ-UHFFFAOYSA-N\n+SMILES: CN=C(Oc1cccc(c1)N=CN(C)C)O\n+NAME: Formetanate_1\n+RETENTIONTIME: 0.7730471\n+PRECURSORMZ: 222.1239\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 13\n+93.03365 1796.0\n+107.04935 1981.0\n+111.04435 82262.0\n+118.04142 1927.0\n+120.04462 150907.0\n+121.03984 67610.0\n+122.06016 5909.0\n+122.75254 1678.0\n+150.98424 1930.0\n+165.1024 143887.0\n+173.50876 2616.0\n+200.05632 2056.0\n+208.52768 2170.0\n+\n+FORMULA: C11H15N3O2\n+INCHIKEY: MYPKGPZHHQEODQ-UHFFFAOYSA-N\n+SMILES: CN=C(Oc1cccc(c1)N=CN(C)C)O\n+NAME: Formetanate_2\n+RETENTIONTIME: 1.13043\n+PRECURSORMZ: 222.1239\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 15\n+91.05441 6330.0\n+93.03365 27201.0\n+107.04935 4024.0\n+111.04435 131558.0\n+115.05429 3711.0\n+117.06996 5571.0\n+118.04177 4476.0\n+120.04462 274740.0\n+121.03984 113412.0\n+122.06016 7843.0\n+124.07605 4049.0\n+135.04427 4178.0\n+145.06488 3067.0\n+164.95049 3848.0\n+165.1024 263802.0\n+\n+FORMULA: C12H18N2O2\n+INCHIKEY: YNEVBPNZHBAYOA-UHFFFAOYSA-N\n+SMILES: CN=C(Oc1cc(C)c(c(c1)C)N(C)C)O\n+NAME: Mexacarbate\n+RETENTIONTIME: 1.682191\n+PRECURSORMZ: 223.1443\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 5\n+134.07283 2632951.0\n+136.07611 26036728.0\n+150.092 1572118.0\n+151.09932 54847764.0\n+166.12282 1541928.0\n+\n+FORMULA: C19H21N2OCl\n+INCHIKEY: OGYFATSSENRIKG-UHFFFAOYSA-N\n+SMILES: Clc1ccc(cc1)CN(C(=Nc1ccccc1)O)C1CCCC1\n+NAME: Monceren\n+RETENTIONTIME: 7.14553\n+PRECURSORMZ: 329.1426\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 5\n+89.03881 550831.0\n+94.06543 635265.0\n+106.06545 446416.0\n+125.01307 512150.0\n+125.01532 37442116.0\n+\n+FORMULA: C16H16N2O4\n+INCHIKEY: WZJZMXBKUWKXTQ-UHFFFAOYSA-N\n+SMILES: CCOC(=Nc1cccc(c1)OC(=Nc1ccccc1)O)O\n+NAME: Desmedipham\n+RETENTIONTIME: 6.430396\n+PRECURSORMZ: 301.1192\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 3\n+136.03947 1773399.0\n+154.04993 1002798.0\n+182.08162 6480130.0\n+\n+FORMULA: C16H16N2O4\n+INCHIKEY: IDOWTHOLJBTAFI-UHFFFAOYSA-N\n+SMILES: COC(=Nc1cccc(c1)OC(=Nc1cccc(c1)C)O)O\n+NAME: Phenmedipham\n+RETENTIONTIME: 6.570995\n+PRECURSORMZ: 301.1185\n+PRECURSORTYPE: [M+H]+\n+INSTRUMENTTYPE: LC-ESI-Orbitrap\n+NUM PEAKS: 2\n+136.03947 2596929.0\n+168.06587 7038054.0\n+\n'
b
diff -r e5010b19d64d -r 872d8040f713 test-data/filtering/reduce_to_top_n_peaks.msp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filtering/reduce_to_top_n_peaks.msp Thu Oct 12 13:25:30 2023 +0000
b
@@ -0,0 +1,57 @@
+IONMODE: negative
+SPECTRUMTYPE: Centroid
+COMPOUND_NAME: C001
+RETENTION_TIME: 38.74
+RETENTION_INDEX: None
+NUM PEAKS: 5
+175.0641    26780143.0
+206.9034    26130980.0
+216.9205    32607700.0
+254.8252    23747536.0
+256.8215    31377637.0
+
+IONMODE: negative
+SPECTRUMTYPE: Centroid
+COMPOUND_NAME: C002
+RETENTION_TIME: 520.25
+RETENTION_INDEX: 1234.5
+NUM PEAKS: 5
+310.1623    295359836.0
+525.375     1073323842.0
+526.3783    181668883.0
+551.3321    111616808.0
+1047.7378   150394804.0
+
+IONMODE: negative
+SPECTRUMTYPE: Centroid
+COMPOUND_NAME: C003
+RETENTION_TIME: 483.67
+NUM PEAKS: 5
+288.6414    202172046.0
+522.3565    4089569222.0
+523.354     1201714423.0
+1043.7028   144351468.0
+1044.7068   83271854.0
+
+IONMODE: negative
+SPECTRUMTYPE: Centroid
+COMPOUND_NAME: C004
+RETENTION_TIME: 473.48
+NUM PEAKS: 5
+496.34      12577588056.0
+497.3442    3337125302.0
+498.3462    532285213.0
+991.6726    1420557258.0
+992.6749    763118028.0
+
+IONMODE: negative
+SPECTRUMTYPE: Centroid
+COMPOUND_NAME: C005
+RETENTION_TIME: 41.72
+NUM PEAKS: 5
+218.1386    14009249.0
+337.0623    88672453.0
+353.0361    37061354.0
+359.0443    48435582.0
+375.018     29159485.0
+
b
diff -r e5010b19d64d -r 872d8040f713 test-data/filtering/require_filter.msp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filtering/require_filter.msp Thu Oct 12 13:25:30 2023 +0000
[
@@ -0,0 +1,72 @@
+FORMULA: C13H9ClFeO4Si
+CASNO: 2000570-99-8
+ID: 2011
+COMMENT: SpectrumID: 1519953; Source: C4-1998-38-3; Class: Benzenoids; CASRN not real!
+COMPOUND_NAME: ((.eta.5-Cyclopentadienylironbiscarbonyl)(1,2-phenylenedioxysilyl)chloride complex
+PARENT_MASS: 347.930801
+PUBCHEMID: 10970124
+NOMINAL_MASS: 348
+SMILES: Cl[Si]1Oc2ccccc2O1.[C-]#[O+].[C-]#[O+].[CH]1C=CC=C1.[Fe]
+NUM PEAKS: 3
+292.0       999.0
+314.0       118.89
+348.0       734.24
+
+FORMULA: C13H14O
+CASNO: 2000130-22-2
+ID: 7198
+COMMENT: SpectrumID: 1752764; Source: A1-13-956/SMS7-13; DOI: 10.1021/ol1029996; QI: 383; Class: Benzene and substituted derivatives; CASRN not real! |RI:1588|
+COMPOUND_NAME: ((1R*,2R*)-1-Methyl-2-phenylethynylcyclopropyl)methanol
+PARENT_MASS: 186.1044655
+RETENTION_INDEX: 1588.0
+PUBCHEMID: 130762197
+NOMINAL_MASS: 186
+INCHI: InChI=1S/C13H14O/c1-13(10-14)9-12(13)8-7-11-5-3-2-4-6-11/h2-6,12,14H,9-10H2,1H3/t12-,13-/m0/s1
+NUM PEAKS: 20
+51.0        89.92
+63.0        89.92
+77.0        179.84
+88.0        39.96
+89.0        59.95
+91.0        49.95
+102.0       149.86
+113.0       49.95
+115.0       229.79
+127.0       139.87
+128.0       999.0
+129.0       199.82
+144.0       99.91
+155.0       119.89
+156.0       14.89
+157.0       1.1
+158.0       0.1
+186.0       39.96
+187.0       5.89
+188.0       0.5
+
+FORMULA: C34H54O4
+CASNO: 2000774-54-3
+ID: 36905
+COMMENT: SpectrumID: 1800193; Source: PA-7-239-4(DIP); DOI: 10.1002_(SICI)1099-1565(199605)7_3_136; Class: Triterpenoids; CASRN not real! |RI:3353|
+COMPOUND_NAME: ((1R,3aS,5aR,5bR,7aR,9S,11aR,11bR,13aR,13bR)-9-acetoxy-5a,5b,8,8,11a-pentamethyl-1-(prop-1-en-2-yl)icosahydro-1H-cyclopenta[a]chrysen-3a-yl)methyl acetate
+PARENT_MASS: 526.40221
+RETENTION_INDEX: 3353.0
+PUBCHEMID: 236415
+NOMINAL_MASS: 526
+INCHI: InChI=1S/C34H54O4/c1-21(2)24-12-17-34(20-37-22(3)35)19-18-32(8)25(29(24)34)10-11-27-31(7)15-14-28(38-23(4)36)30(5,6)26(31)13-16-33(27,32)9/h24-29H,1,10-20H2,2-9H3/t24-,25+,26-,27+,28-,29+,31-,32+,33+,34+/m0/s1
+SMILES: C=C(C)[C@@H]1CC[C@]2(COC(C)=O)CC[C@]3(C)[C@H](CC[C@@H]4[C@@]5(C)CC[C@H](OC(C)=O)C(C)(C)[C@@H]5CC[C@]43C)[C@@H]12
+NUM PEAKS: 14
+189.0       419.62
+203.0       249.77
+216.0       149.86
+262.0       79.93
+276.0       49.95
+393.0       149.86
+423.0       219.8
+453.0       179.84
+466.0       999.0
+526.0       179.84
+527.0       68.94
+528.0       14.29
+529.0       2.1
+530.0       0.2
\ No newline at end of file
b
diff -r e5010b19d64d -r 872d8040f713 test-data/filtering/require_inchi_out.msp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filtering/require_inchi_out.msp Thu Oct 12 13:25:30 2023 +0000
[
@@ -0,0 +1,59 @@
+FORMULA: C13H14O
+CASNO: 2000130-22-2
+ID: 7198
+COMMENT: SpectrumID: 1752764; Source: A1-13-956/SMS7-13; DOI: 10.1021/ol1029996; QI: 383; Class: Benzene and substituted derivatives; CASRN not real! |RI:1588|
+COMPOUND_NAME: ((1R*,2R*)-1-Methyl-2-phenylethynylcyclopropyl)methanol
+PARENT_MASS: 186.1044655
+RETENTION_INDEX: 1588.0
+PUBCHEMID: 130762197
+NOMINAL_MASS: 186
+INCHI: InChI=1S/C13H14O/c1-13(10-14)9-12(13)8-7-11-5-3-2-4-6-11/h2-6,12,14H,9-10H2,1H3/t12-,13-/m0/s1
+NUM PEAKS: 20
+51.0        89.92
+63.0        89.92
+77.0        179.84
+88.0        39.96
+89.0        59.95
+91.0        49.95
+102.0       149.86
+113.0       49.95
+115.0       229.79
+127.0       139.87
+128.0       999.0
+129.0       199.82
+144.0       99.91
+155.0       119.89
+156.0       14.89
+157.0       1.1
+158.0       0.1
+186.0       39.96
+187.0       5.89
+188.0       0.5
+
+FORMULA: C34H54O4
+CASNO: 2000774-54-3
+ID: 36905
+COMMENT: SpectrumID: 1800193; Source: PA-7-239-4(DIP); DOI: 10.1002_(SICI)1099-1565(199605)7_3_136; Class: Triterpenoids; CASRN not real! |RI:3353|
+COMPOUND_NAME: ((1R,3aS,5aR,5bR,7aR,9S,11aR,11bR,13aR,13bR)-9-acetoxy-5a,5b,8,8,11a-pentamethyl-1-(prop-1-en-2-yl)icosahydro-1H-cyclopenta[a]chrysen-3a-yl)methyl acetate
+PARENT_MASS: 526.40221
+RETENTION_INDEX: 3353.0
+PUBCHEMID: 236415
+NOMINAL_MASS: 526
+INCHI: InChI=1S/C34H54O4/c1-21(2)24-12-17-34(20-37-22(3)35)19-18-32(8)25(29(24)34)10-11-27-31(7)15-14-28(38-23(4)36)30(5,6)26(31)13-16-33(27,32)9/h24-29H,1,10-20H2,2-9H3/t24-,25+,26-,27+,28-,29+,31-,32+,33+,34+/m0/s1
+SMILES: C=C(C)[C@@H]1CC[C@]2(COC(C)=O)CC[C@]3(C)[C@H](CC[C@@H]4[C@@]5(C)CC[C@H](OC(C)=O)C(C)(C)[C@@H]5CC[C@]43C)[C@@H]12
+NUM PEAKS: 14
+189.0       419.62
+203.0       249.77
+216.0       149.86
+262.0       79.93
+276.0       49.95
+393.0       149.86
+423.0       219.8
+453.0       179.84
+466.0       999.0
+526.0       179.84
+527.0       68.94
+528.0       14.29
+529.0       2.1
+530.0       0.2
+
b
diff -r e5010b19d64d -r 872d8040f713 test-data/filtering/require_smiles_out.msp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/filtering/require_smiles_out.msp Thu Oct 12 13:25:30 2023 +0000
[
@@ -0,0 +1,41 @@
+FORMULA: C13H9ClFeO4Si
+CASNO: 2000570-99-8
+ID: 2011
+COMMENT: SpectrumID: 1519953; Source: C4-1998-38-3; Class: Benzenoids; CASRN not real!
+COMPOUND_NAME: ((.eta.5-Cyclopentadienylironbiscarbonyl)(1,2-phenylenedioxysilyl)chloride complex
+PARENT_MASS: 347.930801
+PUBCHEMID: 10970124
+NOMINAL_MASS: 348
+SMILES: Cl[Si]1Oc2ccccc2O1.[C-]#[O+].[C-]#[O+].[CH]1C=CC=C1.[Fe]
+NUM PEAKS: 3
+292.0       999.0
+314.0       118.89
+348.0       734.24
+
+FORMULA: C34H54O4
+CASNO: 2000774-54-3
+ID: 36905
+COMMENT: SpectrumID: 1800193; Source: PA-7-239-4(DIP); DOI: 10.1002_(SICI)1099-1565(199605)7_3_136; Class: Triterpenoids; CASRN not real! |RI:3353|
+COMPOUND_NAME: ((1R,3aS,5aR,5bR,7aR,9S,11aR,11bR,13aR,13bR)-9-acetoxy-5a,5b,8,8,11a-pentamethyl-1-(prop-1-en-2-yl)icosahydro-1H-cyclopenta[a]chrysen-3a-yl)methyl acetate
+PARENT_MASS: 526.40221
+RETENTION_INDEX: 3353.0
+PUBCHEMID: 236415
+NOMINAL_MASS: 526
+INCHI: InChI=1S/C34H54O4/c1-21(2)24-12-17-34(20-37-22(3)35)19-18-32(8)25(29(24)34)10-11-27-31(7)15-14-28(38-23(4)36)30(5,6)26(31)13-16-33(27,32)9/h24-29H,1,10-20H2,2-9H3/t24-,25+,26-,27+,28-,29+,31-,32+,33+,34+/m0/s1
+SMILES: C=C(C)[C@@H]1CC[C@]2(COC(C)=O)CC[C@]3(C)[C@H](CC[C@@H]4[C@@]5(C)CC[C@H](OC(C)=O)C(C)(C)[C@@H]5CC[C@]43C)[C@@H]12
+NUM PEAKS: 14
+189.0       419.62
+203.0       249.77
+216.0       149.86
+262.0       79.93
+276.0       49.95
+393.0       149.86
+423.0       219.8
+453.0       179.84
+466.0       999.0
+526.0       179.84
+527.0       68.94
+528.0       14.29
+529.0       2.1
+530.0       0.2
+
b
diff -r e5010b19d64d -r 872d8040f713 test-data/out_matchms_add_key.msp
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/out_matchms_add_key.msp Thu Oct 12 13:25:30 2023 +0000
b
@@ -0,0 +1,199 @@
+IONMODE: Negative
+SPECTRUMTYPE: Centroid
+COMPOUND_NAME: C001
+RETENTION_TIME: 38.74
+RETENTION_INDEX: -1
+TOOL_USED: matchms
+NUM PEAKS: 57
+138.9121    10186226.0
+148.9337    1008656.0
+175.0641    26780143.0
+186.1095    2675456.0
+196.8658    21390430.0
+198.8647    21688594.0
+200.8848    7742528.0
+206.9034    26130980.0
+216.9205    32607700.0
+234.0134    2550129.0
+254.8252    23747536.0
+256.8215    31377637.0
+258.8237    15532799.0
+266.8652    9805546.0
+268.8537    3090354.0
+306.9914    3169316.0
+312.7841    10051801.0
+316.7777    10734168.0
+322.8157    6317648.0
+324.9549    8619910.0
+334.849     4178412.0
+342.8093    3285552.0
+349.9455    2050695.0
+350.9875    6150799.0
+351.941     1965882.0
+366.8281    3253770.0
+370.7418    9765463.0
+372.7383    19374863.0
+382.8218    12815572.0
+384.8177    8311500.0
+392.7685    10913351.0
+413.2664    3965867.0
+426.7772    5431633.0
+428.7834    8554675.0
+434.7287    9943329.0
+436.8161    3705247.0
+440.7322    10603010.0
+442.7401    8271752.0
+450.7016    8762673.0
+460.7076    4528973.0
+462.7862    2123666.0
+484.7242    4273989.0
+486.7743    4886062.0
+488.6825    12267966.0
+492.744     7662344.0
+494.8953    7188793.0
+498.8794    6811405.0
+500.8484    6520691.0
+502.7832    3567833.0
+510.763     4989757.0
+518.7415    4243468.0
+546.6093    7177067.0
+550.6949    6104789.0
+566.5977    5171811.0
+612.6927    2005587.0
+676.6436    1982714.0
+800.4451    2792137.0
+
+IONMODE: Negative
+SPECTRUMTYPE: Centroid
+COMPOUND_NAME: C002
+RETENTION_TIME: 520.25
+RETENTION_INDEX: 1234.5
+TOOL_USED: matchms
+NUM PEAKS: 35
+131.1733    1971789.0
+267.2688    6103973.0
+279.0196    1946255.0
+289.6491    46498377.0
+301.1565    15185412.0
+309.1649    18045974.0
+310.1623    295359836.0
+311.1658    13124727.0
+312.0296    38757284.0
+330.6757    12666597.0
+525.375     1073323842.0
+526.3783    181668883.0
+527.3812    23642795.0
+551.3321    111616808.0
+552.3348    28340614.0
+553.3314    2609936.0
+562.3269    7538206.0
+578.2905    7578406.0
+619.3008    4742103.0
+624.296     11790213.0
+813.5403    25060147.0
+814.5336    5865975.0
+955.1171    2322927.0
+1047.7378   150394804.0
+1048.7399   90978863.0
+1049.7432   29946438.0
+1050.7453   6807767.0
+1069.7158   5074652.0
+1074.1979   3402288.0
+1075.1968   33352763.0
+1076.2004   10417953.0
+1101.6535   2023916.0
+1206.3127   3738816.0
+1216.8041   4439324.0
+1217.807    3565334.0
+
+IONMODE: Negative
+SPECTRUMTYPE: Centroid
+COMPOUND_NAME: C003
+RETENTION_TIME: 483.67
+TOOL_USED: matchms
+NUM PEAKS: 26
+265.2529    11366224.0
+266.2564    1420444.0
+279.6362    29849749.0
+280.6546    8848921.0
+288.6414    202172046.0
+378.2093    15309961.0
+379.1966    2902366.0
+522.3565    4089569222.0
+523.354     1201714423.0
+549.3267    63300808.0
+576.2749    7386007.0
+577.3074    2354251.0
+617.2778    2323470.0
+625.4543    4040374.0
+796.9808    13576738.0
+797.9841    6368973.0
+809.9883    12596682.0
+810.9916    6601055.0
+1043.7028   144351468.0
+1044.7068   83271854.0
+1045.706    27998321.0
+1046.7131   6505178.0
+1058.1594   20718345.0
+1059.1626   6608764.0
+1071.1639   15461047.0
+1072.1671   5096642.0
+
+IONMODE: Negative
+SPECTRUMTYPE: Centroid
+COMPOUND_NAME: C004
+RETENTION_TIME: 473.48
+TOOL_USED: matchms
+NUM PEAKS: 24
+124.1405    6517662.0
+170.2437    1237313.0
+275.6336    28001849.0
+296.147     190395687.0
+482.3247    145772322.0
+483.3283    36245876.0
+496.34      12577588056.0
+497.3442    3337125302.0
+498.3462    532285213.0
+499.3493    68176083.0
+770.964     49250157.0
+771.9675    22666873.0
+783.9721    9839299.0
+784.9749    3622908.0
+949.6233    8009033.0
+950.6274    3674694.0
+991.6726    1420557258.0
+992.6749    763118028.0
+993.6787    239161906.0
+994.6801    53549573.0
+1017.6897   168186952.0
+1018.6656   120599518.0
+1019.6555   57647644.0
+1020.6591   12469103.0
+
+IONMODE: Negative
+SPECTRUMTYPE: Centroid
+COMPOUND_NAME: C005
+RETENTION_TIME: 41.72
+TOOL_USED: matchms
+NUM PEAKS: 20
+218.1386    14009249.0
+337.0623    88672453.0
+338.0654    8770055.0
+353.0361    37061354.0
+359.0443    48435582.0
+360.0459    5025128.0
+375.018     29159485.0
+376.0216    2740193.0
+381.0261    13522755.0
+396.9999    10317665.0
+417.0027    13822994.0
+418.9966    4386311.0
+432.9764    9779399.0
+438.9851    11307111.0
+440.9796    3364168.0
+454.9592    9820452.0
+456.9603    3774845.0
+470.9263    3632486.0
+512.8989    4072570.0
+572.871     3485486.0
+
b
diff -r e5010b19d64d -r 872d8040f713 test-data/similarity/scores_test6_out.json
--- a/test-data/similarity/scores_test6_out.json Tue Jun 27 14:26:29 2023 +0000
+++ b/test-data/similarity/scores_test6_out.json Thu Oct 12 13:25:30 2023 +0000
[
b'@@ -1,1 +1,1 @@\n-{"__Scores__": true, "is_symmetric": false, "references": [{"scannumber": "-1", "ionmode": "positive", "spectrumtype": "Centroid", "formula": "C20H12", "inchikey": "CSHWQDPOILHKBI-UHFFFAOYSA-N", "smiles": "C1=CC2=C3C(=C1)C1=CC=CC4=C1C(=CC=C4)C3=CC=C2", "authors": "Price et al., RECETOX, Masaryk University (CZ)", "instrument": "Q Exactive GC Orbitrap GC-MS/MS", "ionization": "EI+", "license": "CC BY-NC", "compound_name": "Perylene", "retention_time": null, "retention_index": 2886.9, "precursor_mz": 252.09323, "adduct": "[M]+", "collision_energy": "70eV", "instrument_type": "GC-EI-Orbitrap", "charge": 1, "parent_mass": "251.08595400000002", "peak_comments": {"252.09323": "Theoretical m/z 252.093354, Mass diff 0 (0.49 ppm), SMILES C1=CC=2C=CC=C3C4=CC=CC5=CC=CC(C(=C1)C23)=C54, Annotation [C20H12]+, Rule of HR False"}, "num_peaks": "3", "peaks_json": [[250.07765, 0.3282529462971431], [252.09323, 1.0], [253.09656, 0.20573802940517583]]}, {"scannumber": "-1", "ionmode": "positive", "spectrumtype": "Centroid", "formula": "C14H10", "inchikey": "YNPNZTXNASCQKK-UHFFFAOYSA-N", "smiles": "C1=CC2=C(C=C1)C1=C(C=CC=C1)C=C2", "authors": "Price et al., RECETOX, Masaryk University (CZ)", "instrument": "Q Exactive GC Orbitrap GC-MS/MS", "ionization": "EI+", "license": "CC BY-NC", "compound_name": "Phenanthrene", "retention_time": null, "retention_index": 1832.9, "precursor_mz": 178.0775, "adduct": "[M]+", "collision_energy": "70eV", "instrument_type": "GC-EI-Orbitrap", "charge": 1, "parent_mass": "177.070224", "peak_comments": {"176.062": "Theoretical m/z 176.0626, Mass diff 0 (0 ppm), Formula C14H8", "177.06982": "Theoretical m/z 177.070425, Mass diff 0 (0 ppm), Formula C14H9", "178.0775": "Theoretical m/z 178.077698, Mass diff 0 (1.11 ppm), SMILES C=1C=CC2=C(C1)C=CC=3C=CC=CC32, Annotation [C14H10]+, Rule of HR False"}, "num_peaks": "5", "peaks_json": [[152.0619, 0.1657993569424221], [176.062, 0.24558560966311757], [177.06982, 0.12764433529926775], [178.0775, 1.0], [179.08078, 0.16394988149600653]]}, {"scannumber": "-1", "ionmode": "positive", "spectrumtype": "Centroid", "formula": "C14H10", "inchikey": "MWPLVEDNUUSJAV-UHFFFAOYSA-N", "smiles": "C1=CC2=CC3=C(C=CC=C3)C=C2C=C1", "authors": "Price et al., RECETOX, Masaryk University (CZ)", "instrument": "Q Exactive GC Orbitrap GC-MS/MS", "ionization": "EI+", "license": "CC BY-NC", "compound_name": "Anthracene", "retention_time": null, "retention_index": 1844.4, "precursor_mz": 178.07754, "adduct": "[M]+", "collision_energy": "70eV", "instrument_type": "GC-EI-Orbitrap", "charge": 1, "parent_mass": "177.070264", "peak_comments": {"176.06204": "Theoretical m/z 176.0626, Mass diff 0 (0 ppm), Formula C14H8", "177.06984": "Theoretical m/z 177.070425, Mass diff 0 (0 ppm), Formula C14H9", "178.07754": "Theoretical m/z 178.077698, Mass diff 0 (0.89 ppm), SMILES C=1C=CC=2C=C3C=CC=CC3=CC2C1, Annotation [C14H10]+, Rule of HR False"}, "num_peaks": "5", "peaks_json": [[152.06195, 0.12450313104470498], [176.06204, 0.23295403420236208], [177.06984, 0.1074344883724439], [178.07754, 1.0], [179.08081, 0.1616741186784917]]}, {"scannumber": "-1", "ionmode": "positive", "spectrumtype": "Centroid", "formula": "C12H10", "inchikey": "CWRYPZZKDGJXCA-UHFFFAOYSA-N", "smiles": "C1CC2=C3C1=CC=CC3=CC=C2", "authors": "Price et al., RECETOX, Masaryk University (CZ)", "instrument": "Q Exactive GC Orbitrap GC-MS/MS", "ionization": "EI+", "license": "CC BY-NC", "compound_name": "Acenaphthene", "retention_time": null, "retention_index": 1528.3, "precursor_mz": 154.07741, "adduct": "[M]+", "collision_energy": "70eV", "instrument_type": "GC-EI-Orbitrap", "charge": 1, "parent_mass": "153.070134", "peak_comments": {"151.05418": "Theoretical m/z 151.054775, Mass diff 0 (0 ppm), Formula C12H7", "153.06969": "Theoretical m/z 153.070425, Mass diff 0 (0 ppm), Formula C12H9", "154.07741": "Theoretical m/z 154.077698, Mass diff 0 (1.87 ppm), SMILES C=1C=C2C=CC=C3C2=C(C1)CC3, Annotation [C12H10]+, Rule of HR False"}, "num_peaks":'..b'N], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN], [NaN]], "dtype": [["Spec2Vec", "<f8"]]}\n\\ No newline at end of file\n'
b
diff -r e5010b19d64d -r 872d8040f713 test-data/spectral_similarity/test1.json
--- a/test-data/spectral_similarity/test1.json Tue Jun 27 14:26:29 2023 +0000
+++ b/test-data/spectral_similarity/test1.json Thu Oct 12 13:25:30 2023 +0000
[
b'@@ -1,1 +1,1 @@\n-{"__Scores__": true, "is_symmetric": false, "references": [{"scannumber": "1161", "ionmode": "positive", "spectrumtype": "Centroid", "formula": "C4H10NO3PS", "inchikey": "YASYVMFAVPKPKE-SECBINFHSA-N", "smiles": "COP(=O)(N=C(O)C)SC", "authors": "Biomarker Analytical Laboratories, RECETOX, Masaryk University (CZ)", "instrument": "LC Orbitrap Fusion Tribrid MS", "ionization": "ESI+", "license": "CC BY-NC", "num_peaks": "16", "compound_name": "Acephate", "retention_time": 1.232997, "precursor_mz": 184.0194, "adduct": "[M+H]+", "instrument_type": "LC-ESI-Orbitrap", "peaks_json": [[90.09368, 1128.0], [93.11512, 1241.0], [95.10279, 1118.0], [101.31465, 1152.0], [102.90688, 1322.0], [103.98039, 1201.0], [112.01607, 12289.0], [112.99994, 38027.0], [115.00399, 1634.0], [124.98121, 922.0], [128.97701, 9208.0], [132.57193, 1350.0], [135.84808, 1428.0], [142.99275, 16419.0], [147.94205, 1750.0], [173.5094, 2353.0]]}, {"scannumber": "2257", "ionmode": "positive", "spectrumtype": "Centroid", "formula": "C12H11NO2", "inchikey": "CVXBEEMKQHEXEN-UHFFFAOYSA-N", "smiles": "CN=C(Oc1cccc2c1cccc2)O", "authors": "Biomarker Analytical Laboratories, RECETOX, Masaryk University (CZ)", "instrument": "LC Orbitrap Fusion Tribrid MS", "ionization": "ESI+", "license": "CC BY-NC", "peak_comments": {"145.06491": "Theoretical m/z 145.064787, Mass diff 0 (0.85 ppm), SMILES OC1=CC=CC=2C=CC=CC12, Annotation [C10H8O+H]+, Rule of HR True"}, "num_peaks": "1", "compound_name": "Carbaryl", "retention_time": 5.259445, "precursor_mz": 202.0863, "adduct": "[M+H]+", "instrument_type": "LC-ESI-Orbitrap", "peaks_json": [[145.06491, 1326147.0]]}, {"scannumber": "1516", "ionmode": "positive", "spectrumtype": "Centroid", "formula": "C8H16NO5P", "inchikey": "VEENJGZXVHKXNB-UHFFFAOYSA-N", "smiles": "COP(=O)(OC(=CC(=O)N(C)C)C)OC", "authors": "Biomarker Analytical Laboratories, RECETOX, Masaryk University (CZ)", "instrument": "LC Orbitrap Fusion Tribrid MS", "ionization": "ESI+", "license": "CC BY-NC", "peak_comments": {"112.07591": "Theoretical m/z 112.075687, Mass diff 0 (1.99 ppm), SMILES O=C(C=CC)N(C)C, Annotation [C6H11NO-H]+, Rule of HR True", "127.01563": "Theoretical m/z 127.01547, Mass diff 0 (1.26 ppm), SMILES O=P(O)(OC)OC, Annotation [C2H7O4P+H]+, Rule of HR True", "193.02605": "Theoretical m/z 193.026035, Mass diff 0 (0.08 ppm), SMILES O=CC=C(OP(=O)(OC)OC)C, Annotation [C6H11O5P-H]+, Rule of HR True", "238.08437": "Theoretical m/z 238.083891, Mass diff 0 (2.01 ppm), SMILES O=C(C=C(OP(=O)(OC)OC)C)N(C)C, Annotation [C8H16NO5P+H]+, Rule of HR True"}, "num_peaks": "5", "compound_name": "Dicrotophos", "retention_time": 2.025499, "precursor_mz": 238.0844, "adduct": "[M+H]+", "instrument_type": "LC-ESI-Orbitrap", "peaks_json": [[112.074, 102027.0], [112.07591, 9070987.0], [127.01563, 3230337.0], [193.02605, 7897744.0], [238.08437, 2973124.0]]}, {"scannumber": "1865", "ionmode": "positive", "spectrumtype": "Centroid", "formula": "C5H12NO3PS2", "inchikey": "MCWXGJITAZMZEV-UHFFFAOYSA-N", "smiles": "CN=C(CSP(=S)(OC)OC)O", "authors": "Biomarker Analytical Laboratories, RECETOX, Masaryk University (CZ)", "instrument": "LC Orbitrap Fusion Tribrid MS", "ionization": "ESI+", "license": "CC BY-NC", "peak_comments": {"88.0219": "Theoretical m/z 88.021549, Mass diff 0 (3.99 ppm), SMILES SCC=NC, Annotation [C3H7NS-H]+, Rule of HR True", "124.98233": "Theoretical m/z 124.982067, Mass diff 0 (2.11 ppm), SMILES S=P(OC)OC, Annotation [C2H7O2PS-H]+, Rule of HR True", "142.99275": "Theoretical m/z 142.993177, Mass diff 0 (0 ppm), Formula C2H8O3PS", "156.95422": "Theoretical m/z 156.954136, Mass diff 0 (0.54 ppm), SMILES S=P(S)(OC)OC, Annotation [C2H7O2PS2-H]+, Rule of HR True", "170.97": "Theoretical m/z 170.969791, Mass diff 0 (1.22 ppm), SMILES S=P(OC)(OC)SC, Annotation [C3H9O2PS2-H]+, Rule of HR True", "197.98123": "Theoretical m/z 197.980686, Mass diff 0.001 (2.75 ppm), SMILES S=P(OC)SCC(O)=NC, Annotation [C4H10NO2PS2-H]+, Rule of HR True", "198.96501": "Theoretical m/z 198'..b'.2905090298183795, 5], [0.0007786069695307855, 1], [0.0005160257799646218, 2], [0.00038736811468175473, 3], [0.004167449964456371, 2], [0.002043269072734401, 2], [0.0007317319771959746, 4], [1.9838321402668794e-05, 1], [8.801820482468617e-06, 1], [9.255871433471165e-05, 1], [0.00027188763979513214, 1], [4.4947603414879724e-05, 1], [0.009400379409723776, 2], [0.005755388742032419, 5], [0.0007186188027915625, 4], [0.0002820277712237843, 1], [0.0021840777388529727, 3], [0.0021623907785408305, 3], [0.0007283436921234106, 1], [0.0008441009586501761, 1], [0.0002073559764999649, 1], [0.0007416252801498897, 1], [0.00015068661266851979, 1], [8.813547345301928e-05, 1], [9.26696450737907e-05, 1], [0.001547967118838771, 1], [0.05550228874171714, 5], [0.02125475011618532, 3], [0.011654725377056363, 2], [0.048940875059941075, 1], [0.09674974767603109, 4], [0.00012831189436302386, 1], [0.01967432143668846, 1], [0.01205243016929935, 1], [0.0023135823935159366, 4], [0.005962405101607889, 3], [0.004500417483116905, 5], [0.29457761231941254, 5], [6.06340812854675e-05, 2], [0.0006948984533266634, 1], [0.0007187196610315783, 2], [0.001696224336237004, 4], [0.004199029365250834, 3], [5.80297555852376e-05, 2], [0.002157838608906188, 2], [0.0006784306910464165, 5], [2.6175190827127028e-05, 1], [5.3148579647693586e-05, 3], [0.002344175993704721, 2], [0.0007030269709451109, 2], [6.932179507489616e-05, 2], [0.009409539133589516, 4], [0.007853290243932564, 8], [0.0004251026224145782, 3], [0.004851525009980412, 3], [0.002316715000724008, 4], [0.0024536891814221423, 5], [0.0009568700243380552, 1], [1.2682782292908576e-05, 1], [0.00035372089119100686, 1], [0.00019532961465025935, 1], [0.0007449280704076819, 1], [0.0006070307887188872, 3], [0.008405179318406656, 2], [0.06059234768905473, 5], [0.04007587841437244, 7], [0.011192609898471699, 3], [0.05007411887473937, 1], [0.11083177441713818, 4], [0.00041472952800485167, 2], [0.0002443181917148464, 1], [0.018417594020478472, 1], [0.010756665957563824, 1], [0.015590543256883976, 2], [0.04566648744942041, 1], [5.931306182041711e-06, 1], [0.03353471802351476, 2], [0.0016641176911516493, 1], [0.011667998101389831, 2], [0.00013501455706461823, 2], [0.0005641822404665421, 1], [0.013565745106156183, 3], [0.0022081590773529217, 2], [2.755827193743529e-05, 1], [0.0007386957430155464, 1], [0.004966786726868895, 2], [0.0010661461087560196, 2], [0.001063292846857611, 1], [0.000606881881809848, 1], [0.002186896434193792, 1], [0.12865769017257328, 1], [0.028190890883599714, 4], [0.0005044163468766756, 1], [0.002886337967280529, 1], [0.0559715028711476, 1], [0.0002158797917780897, 1], [0.00620360880796666, 2], [0.0007355365822429641, 3], [0.0007942591768873767, 1], [0.00010209303249019821, 2], [0.00019287244985231734, 1], [0.00016324733223990588, 2], [9.721355793156336e-06, 1], [4.652016835129338e-05, 1], [0.019877538355056665, 1], [0.0021026011257930747, 1], [0.0008375181541441654, 3], [2.8019476783829662e-05, 1], [0.00126319178728046, 2], [0.033408666281750724, 1], [0.00022237422701605942, 1], [0.002265272552324613, 1], [0.000234866112352408, 1], [0.0009646911431102235, 2], [0.0001427118212822249, 1], [0.0015640758643172626, 1], [0.027845060459883365, 1], [0.0005869184307697094, 1], [0.002177606309783109, 1], [0.0011879375171201744, 2], [0.00024257932792313028, 2], [0.029015752159248828, 1], [0.012882620708157606, 1], [0.09087822963404141, 1], [0.016369110194600803, 2], [0.0009837491848604097, 1], [0.0010871277430062854, 1], [0.006379222433724256, 2], [0.07844880045683728, 1], [0.003552995338630374, 1], [0.00675497126450081, 1], [0.020492592767589624, 1], [0.0004882302811255579, 1], [0.03692178564115823, 1], [0.05107327151527259, 1], [0.0013454548424030402, 1], [0.00023077527573114648, 1], [0.0034920044529350115, 1], [0.05588674606358348, 1], [0.10402095547417871, 1], [0.004711174266112351, 1]], "dtype": [["CosineGreedy_0.1_0.0_1.0_scores", "<f8"], ["CosineGreedy_0.1_0.0_1.0_matches", "<i8"]]}\n\\ No newline at end of file\n'