Repository 'recetox_aplcms_generate_feature_table'
hg clone https://toolshed.g2.bx.psu.edu/repos/recetox/recetox_aplcms_generate_feature_table

Changeset 0:2810c956ec39 (2023-02-13)
Next changeset 1:e8962c8340f4 (2023-04-03)
Commit message:
planemo upload for repository https://github.com/RECETOX/galaxytools/tree/master/tools/recetox_aplcms commit 506df2aef355b3791567283e1a175914f06b405a
added:
help.xml
images/scheme.png
macros.xml
mzml_id_getter.py
recetox_aplcms_generate_feature_table.xml
test-data/corrected_expected/corrected_0.parquet
test-data/corrected_expected/corrected_1.parquet
test-data/corrected_expected/corrected_2.parquet
test-data/extracted_expected/extracted_0.parquet
test-data/extracted_expected/extracted_1.parquet
test-data/extracted_expected/extracted_2.parquet
test-data/hybrid.recetox.parquet
test-data/hybrid_recovered_feature_sample_table.parquet
test-data/int_cross_table.parquet
test-data/known_table.parquet
test-data/mbr_test0.mzml
test-data/mbr_test0_copy.mzml
test-data/mbr_test1.mzml
test-data/mbr_test2.mzml
test-data/rt_cross_table.parquet
test-data/tolerances.parquet
test-data/two_step_hybrid.recetox.parquet
test-data/two_step_hybrid_info.csv
test-data/unsupervised_output/corrected_features_0.parquet
test-data/unsupervised_output/corrected_features_1.parquet
test-data/unsupervised_output/corrected_features_2.parquet
test-data/unsupervised_output/extracted_features_0.parquet
test-data/unsupervised_output/extracted_features_1.parquet
test-data/unsupervised_output/extracted_features_2.parquet
test-data/unsupervised_output/unsupervised.recetox.parquet
test-data/unsupervised_output/unsupervised_aligned_feature_sample_table.parquet
test-data/unsupervised_output/unsupervised_recovered_feature_sample_table.parquet
utils.R
b
diff -r 000000000000 -r 2810c956ec39 help.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/help.xml Mon Feb 13 10:26:41 2023 +0000
b
b'@@ -0,0 +1,255 @@\n+<macros>\n+\n+<token name="@GENERAL_HELP@">\n+General Information\n+===================\n+\n+Overview\n+--------\n+       \n+recetox-aplcms is a software package for peak detection in high resolution mass spectrometry (HRMS) data.\n+It supports reading .mzml files in raw profile mode and uses a bi-Gaussian chromatographic peak shape for feature detection and quantification.\n+\n+recetox-aplcms is based on the apLCMS package developed by Tianwei Yu at Emory University - see the citations and the apLCMS section beneath.\n+This version includes various software updates and is actively developed and maintained on `GitHub`_.\n+Please submit eventual bug reports as `issues`_ on the repository.\n+\n+.. _GitHub: https://github.com/RECETOX/recetox-aplcms\n+.. _issues: https://github.com/RECETOX/recetox-aplcms/issues/new\n+\n+\n+Workflow\n+--------\n+                   \n+.. image:: https://raw.githubusercontent.com/RECETOX/galaxytools/aee0dd6cf6c05936269efe4337c50e27cc68e86b/tools/recetox_aplcms/images/scheme.png\n+   :width: 2560\n+   :height: 788\n+   :scale: 40\n+   :alt: A picture of a workflow diagram.\n+\n+The individual steps of the recetox-aplcms package can be combined in 2 separate workflows processing HRMS data in an unsupervised manner or by including a-priori knowledge.\n+The workflows consist of the following building blocks:\n+\n+(1) remove noise - denoise the raw data and extract the EIC\n+(2) generate feature table - group features in EIC into peaks using peak-shape model\n+(3) compute clusters - compute mz and rt clusters across samples\n+(4) compute template - find the template for rt correction\n+(5) correct time - correct the rt across samples using splines\n+(6) align features - align identical features across samples\n+(7) recover weaker signals - recover missed features in samples based on the aligned features\n+(8) merge known table - add known features to detected features table and vice versa\n+\n+For detailed documentation 
on the individual steps please see the individual tool wrappers.\n+\n+\n+apLCMS (Original Reference)\n+---------------------------\n+       \n+apLCMS is a software which generates a feature table from a batch of LC/MS spectra. The m/z and retention time\n+tolerance levels are estimated from the data. A run-filter is used to detect peaks and remove noise.\n+Non-parametric statistical methods are used to fine-tune peak selection and grouping. After retention time\n+correction, a feature table is generated by aligning peaks across spectra. For further information on apLCMS\n+please refer to https://mypage.cuhk.edu.cn/academics/yutianwei/apLCMS/.\n+</token>\n+\n+<token name="@REMOVE_NOISE_HELP@">\n+recetox-aplcms - remove noise\n+=============================\n+\n+This tool is the first step of recetox-aplcms.\n+It removes noise from the raw data and performs a first clustering step of points with close m/z values into the extracted ion chromatograms (EICs).\n+Only peaks with a minimum elution length of `min_run` seconds are kept.\n+\n+Example Output\n+--------------\n+The raw data points contained in the scans of the `mzml` file are filtered for noise and grouped into clusters based on m/z values.\n+See an example output in the table below. 
The `group_number` column indicates the cluster index.\n+\n++----------------------+-------------------+-----------------------+--------------------+\n+| mz                   |    rt             |    intensity          |    group_number    |\n++======================+===================+=======================+====================+\n+| 70.01060119055192    |    350.58654      |    21178.330810546875 |    5               |\n++----------------------+-------------------+-----------------------+--------------------+\n+| 70.02334120404554    |    130.175262     |    287869.5478515625  |    10              |\n++----------------------+-------------------+-----------------------+--------------------+\n+| 70.0287408273165     |    134.801352     |    60883.15185546875  |    11              |\n++----------------------+-'..b'   3       |  1                     |  1                     |  1                     |\n++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+\n+|  2    | 70.06505677  |  70.065045   |  70.0650676   |  141.9560055   |  140.5762528  |  143.335758   |   2       |  1                     |  0                     |  1                     |\n++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+\n+|  57   | 78.04643252  |  78.046429   |  78.0464325   |  294.0063397   |  293.9406777  |  294.072001   |   2       |  1                     |  1                     |  0                     |\n++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+\n+|  ...  | ...          |   ...        |  ...          |  ...           |  ...          |  ...          |   ...    
 |  ...                   |  ...                   |  ...                   |\n++-------+--------------+--------------+---------------+----------------+---------------+---------------+-----------+------------------------+------------------------+------------------------+\n+\n+Intensity Table\n+~~~~~~~~~~~~~~~\n+This table contains the peak area for aligned features in all samples.\n+\n++-------+------------------------+------------------------+------------------------+\n+|  id   |  21_qc_no_dil_milliq   |  29_qc_no_dil_milliq   |  8_qc_no_dil_milliq    |\n++=======+========================+========================+========================+\n+|  1    |  13187487.20482895     |  7957395.699119729     |  11700594.397257797    |\n++-------+------------------------+------------------------+------------------------+\n+|  2    |  2075168.6398983458    |  0                     |  2574362.159289044     |\n++-------+------------------------+------------------------+------------------------+\n+|  57   |  2934524.4406785755    |  1333044.5065971944    |  0                     |\n++-------+------------------------+------------------------+------------------------+\n+|  ...  |  ...                   |  ...                   |  ...                   
|\n++-------+------------------------+------------------------+------------------------+\n+\n+Retention Time Table\n+~~~~~~~~~~~~~~~~~~~~\n+This table contains the retention times for all aligned features in all samples.\n+\n++-------+------------------------+------------------------+------------------------+\n+|  id   |  21_qc_no_dil_milliq   |  29_qc_no_dil_milliq   |  8_qc_no_dil_milliq    |\n++=======+========================+========================+========================+\n+|  1    |  294.09792478513236    |  294.1499853056912     |  294.0634942428341     |\n++-------+------------------------+------------------------+------------------------+\n+|  2    |  140.57625284242982    |  0                     |  143.33575827589172    |\n++-------+------------------------+------------------------+------------------------+\n+|  57   |  294.07200187644435    |  293.9406777222317     |  0                     |\n++-------+------------------------+------------------------+------------------------+\n+|  ...  |  ...                   |  ...                   |  ...                   |\n++-------+------------------------+------------------------+------------------------+\n+</token>\n+\n+<token name="@MERGE_KNOWN_TABLES_HELP@">\n+recetox-aplcms - merge known table\n+==================================\n+\n+This tool allows merging the detected features back into the table of known features and vice versa.\n+It is used in the hybrid version of recetox-aplcms to augment the aligned feature table with the suspect peaks \n+and to augment this table with successfully detected features.\n+</token>\n+</macros>\n'
b
diff -r 000000000000 -r 2810c956ec39 images/scheme.png
b
Binary file images/scheme.png has changed
b
diff -r 000000000000 -r 2810c956ec39 macros.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml Mon Feb 13 10:26:41 2023 +0000
b
b'@@ -0,0 +1,154 @@\n+<macros>\r\n+    <token name="@TOOL_VERSION@">0.10.1</token>\r\n+    <xml name="requirements">\r\n+        <requirements>\r\n+            <requirement type="package" version="@TOOL_VERSION@">r-recetox-aplcms</requirement>\r\n+            <requirement type="package" version="2.5.2">pymzml</requirement>\r\n+        </requirements>\r\n+    </xml>\r\n+\r\n+    <xml name="creator">\r\n+        <creator>\r\n+            <person\r\n+                givenName="Maksym"\r\n+                familyName="Skoryk"\r\n+                url="https://github.com/maximskorik"\r\n+                identifier="0000-0003-2056-8018" />\r\n+            <person\r\n+                givenName="Matej"\r\n+                familyName="Troj\xc3\xa1k"\r\n+                url="https://github.com/xtrojak"\r\n+                identifier="0000-0003-0841-2707" />\r\n+            <person\r\n+                givenName="Martin"\r\n+                familyName="\xc4\x8cech"\r\n+                url="https://github.com/martenson"\r\n+                identifier="0000-0002-9318-1781" />\r\n+            <person\r\n+                givenName="Ji\xc5\x99\xc3\xad"\r\n+                familyName="Novotn\xc3\xbd"\r\n+                url="https://github.com/xtracko"\r\n+                identifier="0000-0001-5449-3523" />\r\n+            <person\r\n+                givenName="Helge"\r\n+                familyName="Hecht"\r\n+                url="https://github.com/hechth"\r\n+                identifier="0000-0001-6744-996X" />\r\n+            <organization\r\n+                url="https://www.recetox.muni.cz/"\r\n+                email="GalaxyToolsDevelopmentandDeployment@space.muni.cz"\r\n+                name="RECETOX MUNI"/>\r\n+        </creator>\r\n+    </xml>\r\n+\r\n+    <xml name="remove_noise_params">\r\n+        <param name="min_pres" type="float" value="0.5" label="min_pres"\r\n+               help="The minimum proportion of presence in the time period for a series of signals grouped by 
m/z to be considered a peak." />\r\n+        <param name="min_run" type="float" value="12" label="min_run"\r\n+               help="The minimum length of elution time for a series of signals grouped by m/z to be considered a peak." />\r\n+        <param name="mz_tol" type="float" value="1e-05" label="mz_tol"\r\n+               help="The m/z tolerance level for the grouping of data points. This value is expressed as the fraction of the m/z value. This value, multiplied by the m/z value, becomes the cutoff level. The recommended value is the machine\'s nominal accuracy level. Divide the ppm value by 1e6. For FTMS, 1e-5 is recommended." />\r\n+        <param name="baseline_correct" type="float" value="0" label="baseline_correct"\r\n+               help="After grouping the observations, the highest intensity in each group is found. If the highest is lower than this value, the entire group will be deleted. The default value is NA, in which case the program uses a percentile of the height of the noise groups. If given a value, the value will be used as the threshold, and baseline.correct.noise.percentile will be ignored." />\r\n+        <param name="intensity_weighted" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE" label="intensity_weighted"\r\n+               help="Whether to weight the local density by signal intensities in initial peak detection." />\r\n+    </xml>\r\n+\r\n+    <xml name="generate_feature_table_params">\r\n+        <param name="sd_cut_min" type="float" value="0.01" label="sd_cut_min"\r\n+               help="The minimum standard deviation of a feature to be not eliminated." />\r\n+        <param name="sd_cut_max" type="float" value="500" label="sd_cut_max"\r\n+               help="The maximum standard deviation of a feature to be not eliminated." 
/>\r\n+        <conditional name="shape">\r\n+            <param name="shape_model" type="select" display="radio" label="shape_model"\r\n+                   help="The mathematical model for the shape of a peak. There are two choices - bi-Gaussian and Gaussian. When the peaks are asymmetric, the bi-Gaussian is better.">\r\n+                <option val'..b'elp="Relative m/z tolerance to use for grouping features." />\r\n+                <param name="rt_tol_relative" type="float" optional="true" label="rt_tol_relative"\r\n+                       help="Relative retention time tolerance to use for grouping features." />\r\n+            </when>\r\n+            <when value="file">\r\n+                <param label="Input tolerances values" name="input_tolerances" type="data" format="parquet"\r\n+                       help="Table containing tolerance values." />\r\n+            </when>\r\n+        </conditional>\r\n+        <param name="mz_tol_absolute" type="float" label="mz_tol_absolute" value="1e-05"\r\n+               help="Absolute m/z tolerance to use for grouping features." />\r\n+        <param name="mz_max_diff" type="float" label="mz_max_diff" value="0.01"\r\n+               help="Maximum difference between feature m/z values to belong to the same cluster." />\r\n+    </xml>\r\n+\r\n+    <xml name="recover_weaker_params">\r\n+        <param name="mz_tol" type="float" value="1e-05" label="mz_tol"\r\n+               help="The m/z tolerance level for the grouping of data points. This value is expressed as the\r\n+               fraction of the m/z value. This value, multiplied by the m/z value, becomes the cutoff level.\r\n+               The recommended value is the machine\'s nominal accuracy level. Divide the ppm value by 1e6.\r\n+               For FTMS, 1e-5 is recommended." />\r\n+        <param name="recover_mz_range" type="float" optional="true" label="recover_mz_range"\r\n+               help="The m/z around the feature m/z to search for observations. 
The default value is NA, in which\r\n+               case 1.5 times the m/z tolerance in the aligned object will be used." />\r\n+        <param name="recover_rt_range" type="float" optional="true" label="recover_rt_range"\r\n+               help="The retention time around the feature retention time to search for observations.\r\n+               The default value is NA, in which case 0.5 times the retention time tolerance in the aligned\r\n+                object will be used." />\r\n+        <param name="use_observed_range" type="boolean" checked="true" truevalue="TRUE" falsevalue="FALSE"\r\n+               label="use_observed_range" help="If the value is true, the actual range of the observed locations of\r\n+               the feature in all the spectra will be used." />\r\n+        <param name="recover_min_count" type="integer" value="3" label="recover_min_count"\r\n+               help="The minimum number of raw data points to be considered as a true feature." />\r\n+        <param name="intensity_weighted" type="boolean" checked="false" truevalue="TRUE" falsevalue="FALSE"\r\n+               label="intensity_weighted" help="Whether to weight the local density by signal intensities in initial peak detection." />\r\n+    </xml>\r\n+\r\n+    <xml name="bandwidth_params">\r\n+        <param name="bandwidth" type="float" value="0.5" label="bandwidth"\r\n+               help="A value between zero and one. Multiplying this value to the length of the signal along\r\n+               the time axis helps determine the bandwidth in the kernel smoother used for peak identification." />\r\n+        <param name="min_bandwidth" type="float" optional="true" label="min_bandwidth"\r\n+               help="The minimum bandwidth to use in the kernel smoother." />\r\n+        <param name="max_bandwidth" type="float" optional="true" label="max_bandwidth"\r\n+               help="The maximum bandwidth to use in the kernel smoother." 
/>\r\n+    </xml>\r\n+\r\n+    <xml name="citations">\r\n+        <citations>\r\n+            <citation type="doi">10.1093/bioinformatics/btp291</citation>\r\n+            <citation type="doi">10.1186/1471-2105-11-559</citation>\r\n+            <citation type="doi">10.1021/pr301053d</citation>\r\n+            <citation type="doi">10.1093/bioinformatics/btu430</citation>\r\n+            <citation type="doi">10.1038/s41598-020-70850-0</citation>\r\n+            <yield />\r\n+        </citations>\r\n+    </xml>\r\n+</macros>\r\n'
b
diff -r 000000000000 -r 2810c956ec39 mzml_id_getter.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/mzml_id_getter.py Mon Feb 13 10:26:41 2023 +0000
[
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+import argparse
+import sys
+
+from pymzml.run import Reader
+
+
+def main(argv):
+    """Write the run ID of an mzML file to ``sample_name.txt``.
+
+    Args:
+        argv: Command-line arguments without the program name; exactly one
+            positional argument is expected, the path to the mzML file.
+
+    Raises:
+        FileExistsError: If ``sample_name.txt`` already exists in the current
+            working directory (the file is opened in exclusive-creation mode).
+    """
+    parser = argparse.ArgumentParser(description='Get run ID from an mzML file.')
+    parser.add_argument('mzml_file', help='Path to an mzML file to get run ID from.')
+    # Parse the arguments handed to main() instead of implicitly re-reading sys.argv,
+    # so the function honours its own parameter when called programmatically.
+    args = parser.parse_args(argv)
+
+    mzml = Reader(args.mzml_file)
+    # Named run_id (not id) so the builtin id() is not shadowed.
+    run_id = mzml.info['run_id']
+
+    # No output file is produced when the reader reports no run ID.
+    if run_id is not None:
+        with open("sample_name.txt", mode='x') as f:
+            f.write(run_id)
+
+
+# Script entry point: forward the command-line arguments (program name stripped) to main().
+if __name__ == '__main__':
+    main(sys.argv[1:])
b
diff -r 000000000000 -r 2810c956ec39 recetox_aplcms_generate_feature_table.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/recetox_aplcms_generate_feature_table.xml Mon Feb 13 10:26:41 2023 +0000
[
@@ -0,0 +1,78 @@
+<tool id="recetox_aplcms_generate_feature_table" name="recetox-aplcms - generate feature table" version="@TOOL_VERSION@+galaxy0">
+    <!-- Galaxy tool wrapper: reads noise-removed HRMS profile data from a parquet dataset,
+         calls prof.to.features in an R script and writes the resulting feature table as parquet. -->
+    <description>generate feature table from noise-removed HRMS profile data</description>
+    <!-- macros.xml supplies @TOOL_VERSION@ and the creator, requirements, parameter and citations
+         macros expanded below; help.xml supplies the help tokens used in the help section. -->
+    <macros>
+        <import>macros.xml</import>
+        <import>help.xml</import>
+    </macros>
+    <expand macro="creator"/>
+    <expand macro="requirements"/>
+
+    <!-- Rscript sources utils.R from the tool directory first and then the rendered run_script
+         configfile. Presumably utils.R defines the load/save helpers the script calls
+         (utils.R is not shown here; confirm). -->
+    <command detect_errors="aggressive"><![CDATA[
+        Rscript -e 'source("${__tool_directory__}/utils.R")' -e 'source("${run_script}")'
+    ]]></command>
+    <!-- run_script is an R script rendered per job: the $name placeholders and the
+         #if / #else / #end if lines are template directives filled in from the tool inputs.
+         Flow: load the profile table, read its sample name (a missing name only emits a message
+         and processing continues), call prof.to.features, re-attach the sample name and write
+         the result to the output dataset.
+         min_bandwidth and max_bandwidth are passed as NA when the corresponding input evaluates
+         as false in the template (e.g. left empty); sigma_ratio_lim is only supplied for the
+         bi-Gaussian shape model and is NA otherwise. -->
+    <configfiles>
+         <configfile name="run_script"><![CDATA[
+             profile <- load_data_from_parquet_file('$profile')
+             sample_name <- load_sample_name(profile)
+
+             if(is.na(sample_name)) {
+                 message("The file does not contain sample name.")
+             }
+
+             feature_table <- prof.to.features(
+                 profile = profile,
+                 bandwidth = $bandwidth,
+                 #if $min_bandwidth:
+                 min_bandwidth = $min_bandwidth,
+                 #else:
+                 min_bandwidth = NA,
+                 #end if
+                 #if $max_bandwidth:
+                 max_bandwidth = $max_bandwidth,
+                 #else:
+                 max_bandwidth = NA,
+                 #end if
+                 sd_cut = c($sd_cut_min, $sd_cut_max),
+                 #if $shape.shape_model == "bi-Gaussian":
+                 sigma_ratio_lim = c($shape.sigma_ratio_lim_min, $shape.sigma_ratio_lim_max),
+                 #else:
+                 sigma_ratio_lim = NA,
+                 #end if
+                 shape_model = "$shape.shape_model",
+                 peak_estim_method = "$peak_estim_method",
+                 component_eliminate = $component_eliminate,
+                 moment_power = $moment_power,
+                 BIC_factor = $BIC_factor,
+                 do.plot = FALSE
+             )
+
+             feature_table <- save_sample_name(feature_table, sample_name)
+             save_data_as_parquet_file(feature_table, '$output_file')
+         ]]></configfile>
+    </configfiles>
+
+    <!-- Besides the profile dataset, all parameters come from macros: bandwidth, min_bandwidth and
+         max_bandwidth from bandwidth_params; the remaining values referenced in run_script are
+         expected from generate_feature_table_params (only partly visible here; confirm). -->
+    <inputs>
+        <param label="Input profile data" name="profile" type="data" format="parquet"
+               help="Mass spectrometry profile data." />
+        <expand macro="bandwidth_params"/>
+        <expand macro="generate_feature_table_params"/>
+    </inputs>
+
+    <outputs>
+        <data label="${tool.name} on ${on_string}" name="output_file" format="parquet" />
+    </outputs>
+
+    <!-- NOTE(review): the tests section is empty; no automated tests are defined for this tool. -->
+    <tests>
+
+    </tests>
+
+    <help>
+        <![CDATA[
+            @GENERATE_FEATURE_TABLE_HELP@
+
+            @GENERAL_HELP@
+        ]]>
+    </help>
+
+    <expand macro="citations"/>
+</tool>
b
diff -r 000000000000 -r 2810c956ec39 test-data/corrected_expected/corrected_0.parquet
b
Binary file test-data/corrected_expected/corrected_0.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/corrected_expected/corrected_1.parquet
b
Binary file test-data/corrected_expected/corrected_1.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/corrected_expected/corrected_2.parquet
b
Binary file test-data/corrected_expected/corrected_2.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/extracted_expected/extracted_0.parquet
b
Binary file test-data/extracted_expected/extracted_0.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/extracted_expected/extracted_1.parquet
b
Binary file test-data/extracted_expected/extracted_1.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/extracted_expected/extracted_2.parquet
b
Binary file test-data/extracted_expected/extracted_2.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/hybrid.recetox.parquet
b
Binary file test-data/hybrid.recetox.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/hybrid_recovered_feature_sample_table.parquet
b
Binary file test-data/hybrid_recovered_feature_sample_table.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/int_cross_table.parquet
b
Binary file test-data/int_cross_table.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/known_table.parquet
b
Binary file test-data/known_table.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/mbr_test0.mzml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mbr_test0.mzml Mon Feb 13 10:26:41 2023 +0000
b
b'@@ -0,0 +1,26908 @@\n+<?xml version="1.0" encoding="utf-8"?>\n+<indexedmzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.2_idx.xsd">\n+  <mzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd" id="2016_Jan_12_QE2_47" version="1.1.0">\n+    <cvList count="2">\n+      <cv id="MS" fullName="Proteomics Standards Initiative Mass Spectrometry Ontology" version="3.74.0" URI="http://psidev.cvs.sourceforge.net/*checkout*/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo"/>\n+      <cv id="UO" fullName="Unit Ontology" version="12:10:2011" URI="http://obo.cvs.sourceforge.net/*checkout*/obo/obo/ontology/phenotype/unit.obo"/>\n+    </cvList>\n+    <fileDescription>\n+      <fileContent>\n+        <cvParam cvRef="MS" accession="MS:1000579" name="MS1 spectrum" value=""/>\n+        <cvParam cvRef="MS" accession="MS:1000127" name="centroid spectrum" value=""/>\n+        <cvParam cvRef="MS" accession="MS:1000580" name="MSn spectrum" value=""/>\n+      </fileContent>\n+      <sourceFileList count="3">\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.raw" name="2016_Jan_12_QE2_47.raw" location="">\n+          <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="6f65a1287b2d7522c5fb90a8a5304212b3568b67"/>\n+          <cvParam cvRef="MS" accession="MS:1000563" name="Thermo RAW format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000768" name="Thermo nativeID format" value=""/>\n+        </sourceFile>\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.mzXML" name="2016_Jan_12_QE2_47.mzXML" location="file:///">\n+          <cvParam cvRef="MS" accession="MS:1000776" name="scan number only nativeID format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000566" name="ISB mzXML 
format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="d9c6278c1c6a8f8ca6c69133bacd00efc5f2e706"/>\n+        </sourceFile>\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.mzML" name="2016_Jan_12_QE2_47.mzML" location="file:////Users/kumar207/Documents/Projects/Pratik/script/../mzML">\n+        </sourceFile>\n+      </sourceFileList>\n+    </fileDescription>\n+    <softwareList count="4">\n+      <software id="Xcalibur_x0020_software" version="2.6-264001/2.6.0.2640">\n+        <cvParam cvRef="MS" accession="MS:1000532" name="Xcalibur" value=""/>\n+      </software>\n+      <software id="ReAdW_x0020_software" version="2015.1.0(build Jun 29 2015 14:09:50)">\n+        <cvParam cvRef="MS" accession="MS:1000541" name="ReAdW" value=""/>\n+      </software>\n+      <software id="pwiz_3.0.8990" version="3.0.8990">\n+        <cvParam cvRef="MS" accession="MS:1000615" name="ProteoWizard software" value=""/>\n+      </software>\n+      <software id="pwiz_3.0.7680" version="3.0.7680">\n+        <cvParam cvRef="MS" accession="MS:1000615" name="ProteoWizard software" value=""/>\n+      </software>\n+    </softwareList>\n+    <instrumentConfigurationList count="1">\n+      <instrumentConfiguration id="IC1">\n+        <userParam name="msManufacturer" value="Thermo Finnigan"/>\n+        <userParam name="msModel" value="unknown"/>\n+        <componentList count="3">\n+          <source order="1">\n+            <userParam name="msIonisation" value="NSI"/>\n+          </source>\n+          <analyzer order="1">\n+            <cvParam cvRef="MS" accession="MS:1000079" name="fourier transform ion cyclotron resonance mass spectrometer" value=""/>\n+          </analyzer>\n+          <detector order="1">\n+            <userParam name="msDetector" value="unknown"/>\n+          </detector>\n+        </componentList>\n+        <softwareRef ref="Xcalibur_x0020_software"/>\n+      </instrumentConfiguration>\n+    </instrumentConfigurationList>\n+    
<dataProcessingList count="3">\n+      <dataProcessing id="dataProces'..b'ntrollerNumber=1 scan=90261">2414719</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90324">2418981</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90325">2422771</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90389">2426417</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90825">2430097</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90896">2434045</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90952">2438179</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90967">2441823</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91020">2445963</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91036">2449647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91084">2453875</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91100">2457676</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91151">2462056</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91167">2465804</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91234">2469798</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91304">2473755</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91379">2477398</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91464">2481055</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92113">2484679</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92168">2488404</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92183">2492017</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92238">2495975</offset>\n+      <offset 
idRef="controllerType=0 controllerNumber=1 scan=92249">2499652</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92303">2503647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92318">2507303</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92389">2511184</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94116">2514904</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94183">2518552</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94258">2522503</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94332">2526517</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94729">2530126</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94857">2534084</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94919">2538185</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94980">2541937</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=95063">2545506</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96694">2549319</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96764">2552999</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96836">2556647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96918">2560122</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96993">2563742</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=97169">2567598</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99329">2571216</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99385">2574908</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99446">2578708</offset>\n+    </index>\n+    <index name="chromatogram">\n+    </index>\n+  
</indexList>\n+  <indexListOffset>2582386</indexListOffset>\n+  <fileChecksum>23b8703ddaf744e69e1ba72be2584fd3df6f2dc4</fileChecksum>\n+</indexedmzML>\n'
b
diff -r 000000000000 -r 2810c956ec39 test-data/mbr_test0_copy.mzml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mbr_test0_copy.mzml Mon Feb 13 10:26:41 2023 +0000
b
b'@@ -0,0 +1,26908 @@\n+<?xml version="1.0" encoding="utf-8"?>\n+<indexedmzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.2_idx.xsd">\n+  <mzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd" id="2016_Jan_12_QE2_47" version="1.1.0">\n+    <cvList count="2">\n+      <cv id="MS" fullName="Proteomics Standards Initiative Mass Spectrometry Ontology" version="3.74.0" URI="http://psidev.cvs.sourceforge.net/*checkout*/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo"/>\n+      <cv id="UO" fullName="Unit Ontology" version="12:10:2011" URI="http://obo.cvs.sourceforge.net/*checkout*/obo/obo/ontology/phenotype/unit.obo"/>\n+    </cvList>\n+    <fileDescription>\n+      <fileContent>\n+        <cvParam cvRef="MS" accession="MS:1000579" name="MS1 spectrum" value=""/>\n+        <cvParam cvRef="MS" accession="MS:1000127" name="centroid spectrum" value=""/>\n+        <cvParam cvRef="MS" accession="MS:1000580" name="MSn spectrum" value=""/>\n+      </fileContent>\n+      <sourceFileList count="3">\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.raw" name="2016_Jan_12_QE2_47.raw" location="">\n+          <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="6f65a1287b2d7522c5fb90a8a5304212b3568b67"/>\n+          <cvParam cvRef="MS" accession="MS:1000563" name="Thermo RAW format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000768" name="Thermo nativeID format" value=""/>\n+        </sourceFile>\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.mzXML" name="2016_Jan_12_QE2_47.mzXML" location="file:///">\n+          <cvParam cvRef="MS" accession="MS:1000776" name="scan number only nativeID format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000566" name="ISB mzXML 
format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="d9c6278c1c6a8f8ca6c69133bacd00efc5f2e706"/>\n+        </sourceFile>\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.mzML" name="2016_Jan_12_QE2_47.mzML" location="file:////Users/kumar207/Documents/Projects/Pratik/script/../mzML">\n+        </sourceFile>\n+      </sourceFileList>\n+    </fileDescription>\n+    <softwareList count="4">\n+      <software id="Xcalibur_x0020_software" version="2.6-264001/2.6.0.2640">\n+        <cvParam cvRef="MS" accession="MS:1000532" name="Xcalibur" value=""/>\n+      </software>\n+      <software id="ReAdW_x0020_software" version="2015.1.0(build Jun 29 2015 14:09:50)">\n+        <cvParam cvRef="MS" accession="MS:1000541" name="ReAdW" value=""/>\n+      </software>\n+      <software id="pwiz_3.0.8990" version="3.0.8990">\n+        <cvParam cvRef="MS" accession="MS:1000615" name="ProteoWizard software" value=""/>\n+      </software>\n+      <software id="pwiz_3.0.7680" version="3.0.7680">\n+        <cvParam cvRef="MS" accession="MS:1000615" name="ProteoWizard software" value=""/>\n+      </software>\n+    </softwareList>\n+    <instrumentConfigurationList count="1">\n+      <instrumentConfiguration id="IC1">\n+        <userParam name="msManufacturer" value="Thermo Finnigan"/>\n+        <userParam name="msModel" value="unknown"/>\n+        <componentList count="3">\n+          <source order="1">\n+            <userParam name="msIonisation" value="NSI"/>\n+          </source>\n+          <analyzer order="1">\n+            <cvParam cvRef="MS" accession="MS:1000079" name="fourier transform ion cyclotron resonance mass spectrometer" value=""/>\n+          </analyzer>\n+          <detector order="1">\n+            <userParam name="msDetector" value="unknown"/>\n+          </detector>\n+        </componentList>\n+        <softwareRef ref="Xcalibur_x0020_software"/>\n+      </instrumentConfiguration>\n+    </instrumentConfigurationList>\n+    
<dataProcessingList count="3">\n+      <dataProcessing id="dataProces'..b'ntrollerNumber=1 scan=90261">2414719</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90324">2418981</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90325">2422771</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90389">2426417</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90825">2430097</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90896">2434045</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90952">2438179</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90967">2441823</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91020">2445963</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91036">2449647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91084">2453875</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91100">2457676</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91151">2462056</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91167">2465804</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91234">2469798</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91304">2473755</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91379">2477398</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91464">2481055</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92113">2484679</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92168">2488404</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92183">2492017</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92238">2495975</offset>\n+      <offset 
idRef="controllerType=0 controllerNumber=1 scan=92249">2499652</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92303">2503647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92318">2507303</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92389">2511184</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94116">2514904</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94183">2518552</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94258">2522503</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94332">2526517</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94729">2530126</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94857">2534084</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94919">2538185</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94980">2541937</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=95063">2545506</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96694">2549319</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96764">2552999</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96836">2556647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96918">2560122</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96993">2563742</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=97169">2567598</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99329">2571216</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99385">2574908</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99446">2578708</offset>\n+    </index>\n+    <index name="chromatogram">\n+    </index>\n+  
</indexList>\n+  <indexListOffset>2582386</indexListOffset>\n+  <fileChecksum>23b8703ddaf744e69e1ba72be2584fd3df6f2dc4</fileChecksum>\n+</indexedmzML>\n'
b
diff -r 000000000000 -r 2810c956ec39 test-data/mbr_test1.mzml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mbr_test1.mzml Mon Feb 13 10:26:41 2023 +0000
b
b'@@ -0,0 +1,26908 @@\n+<?xml version="1.0" encoding="utf-8"?>\n+<indexedmzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.2_idx.xsd">\n+  <mzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd" id="2016_Jan_12_QE2_47" version="1.1.0">\n+    <cvList count="2">\n+      <cv id="MS" fullName="Proteomics Standards Initiative Mass Spectrometry Ontology" version="3.74.0" URI="http://psidev.cvs.sourceforge.net/*checkout*/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo"/>\n+      <cv id="UO" fullName="Unit Ontology" version="12:10:2011" URI="http://obo.cvs.sourceforge.net/*checkout*/obo/obo/ontology/phenotype/unit.obo"/>\n+    </cvList>\n+    <fileDescription>\n+      <fileContent>\n+        <cvParam cvRef="MS" accession="MS:1000579" name="MS1 spectrum" value=""/>\n+        <cvParam cvRef="MS" accession="MS:1000127" name="centroid spectrum" value=""/>\n+        <cvParam cvRef="MS" accession="MS:1000580" name="MSn spectrum" value=""/>\n+      </fileContent>\n+      <sourceFileList count="3">\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.raw" name="2016_Jan_12_QE2_47.raw" location="">\n+          <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="6f65a1287b2d7522c5fb90a8a5304212b3568b67"/>\n+          <cvParam cvRef="MS" accession="MS:1000563" name="Thermo RAW format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000768" name="Thermo nativeID format" value=""/>\n+        </sourceFile>\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.mzXML" name="2016_Jan_12_QE2_47.mzXML" location="file:///">\n+          <cvParam cvRef="MS" accession="MS:1000776" name="scan number only nativeID format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000566" name="ISB mzXML 
format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="d9c6278c1c6a8f8ca6c69133bacd00efc5f2e706"/>\n+        </sourceFile>\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.mzML" name="2016_Jan_12_QE2_47.mzML" location="file:////Users/kumar207/Documents/Projects/Pratik/script/../mzML">\n+        </sourceFile>\n+      </sourceFileList>\n+    </fileDescription>\n+    <softwareList count="4">\n+      <software id="Xcalibur_x0020_software" version="2.6-264001/2.6.0.2640">\n+        <cvParam cvRef="MS" accession="MS:1000532" name="Xcalibur" value=""/>\n+      </software>\n+      <software id="ReAdW_x0020_software" version="2015.1.0(build Jun 29 2015 14:09:50)">\n+        <cvParam cvRef="MS" accession="MS:1000541" name="ReAdW" value=""/>\n+      </software>\n+      <software id="pwiz_3.0.8990" version="3.0.8990">\n+        <cvParam cvRef="MS" accession="MS:1000615" name="ProteoWizard software" value=""/>\n+      </software>\n+      <software id="pwiz_3.0.7680" version="3.0.7680">\n+        <cvParam cvRef="MS" accession="MS:1000615" name="ProteoWizard software" value=""/>\n+      </software>\n+    </softwareList>\n+    <instrumentConfigurationList count="1">\n+      <instrumentConfiguration id="IC1">\n+        <userParam name="msManufacturer" value="Thermo Finnigan"/>\n+        <userParam name="msModel" value="unknown"/>\n+        <componentList count="3">\n+          <source order="1">\n+            <userParam name="msIonisation" value="NSI"/>\n+          </source>\n+          <analyzer order="1">\n+            <cvParam cvRef="MS" accession="MS:1000079" name="fourier transform ion cyclotron resonance mass spectrometer" value=""/>\n+          </analyzer>\n+          <detector order="1">\n+            <userParam name="msDetector" value="unknown"/>\n+          </detector>\n+        </componentList>\n+        <softwareRef ref="Xcalibur_x0020_software"/>\n+      </instrumentConfiguration>\n+    </instrumentConfigurationList>\n+    
<dataProcessingList count="3">\n+      <dataProcessing id="dataProces'..b'ntrollerNumber=1 scan=90261">2414719</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90324">2418981</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90325">2422771</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90389">2426417</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90825">2430097</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90896">2434045</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90952">2438179</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90967">2441823</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91020">2445963</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91036">2449647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91084">2453875</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91100">2457676</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91151">2462056</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91167">2465804</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91234">2469798</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91304">2473755</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91379">2477398</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91464">2481055</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92113">2484679</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92168">2488404</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92183">2492017</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92238">2495975</offset>\n+      <offset 
idRef="controllerType=0 controllerNumber=1 scan=92249">2499652</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92303">2503647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92318">2507303</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92389">2511184</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94116">2514904</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94183">2518552</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94258">2522503</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94332">2526517</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94729">2530126</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94857">2534084</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94919">2538185</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94980">2541937</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=95063">2545506</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96694">2549319</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96764">2552999</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96836">2556647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96918">2560122</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96993">2563742</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=97169">2567598</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99329">2571216</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99385">2574908</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99446">2578708</offset>\n+    </index>\n+    <index name="chromatogram">\n+    </index>\n+  
</indexList>\n+  <indexListOffset>2582386</indexListOffset>\n+  <fileChecksum>23b8703ddaf744e69e1ba72be2584fd3df6f2dc4</fileChecksum>\n+</indexedmzML>\n'
b
diff -r 000000000000 -r 2810c956ec39 test-data/mbr_test2.mzml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/mbr_test2.mzml Mon Feb 13 10:26:41 2023 +0000
b
b'@@ -0,0 +1,26908 @@\n+<?xml version="1.0" encoding="utf-8"?>\n+<indexedmzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.2_idx.xsd">\n+  <mzML xmlns="http://psi.hupo.org/ms/mzml" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://psi.hupo.org/ms/mzml http://psidev.info/files/ms/mzML/xsd/mzML1.1.0.xsd" id="2016_Jan_12_QE2_47" version="1.1.0">\n+    <cvList count="2">\n+      <cv id="MS" fullName="Proteomics Standards Initiative Mass Spectrometry Ontology" version="3.74.0" URI="http://psidev.cvs.sourceforge.net/*checkout*/psidev/psi/psi-ms/mzML/controlledVocabulary/psi-ms.obo"/>\n+      <cv id="UO" fullName="Unit Ontology" version="12:10:2011" URI="http://obo.cvs.sourceforge.net/*checkout*/obo/obo/ontology/phenotype/unit.obo"/>\n+    </cvList>\n+    <fileDescription>\n+      <fileContent>\n+        <cvParam cvRef="MS" accession="MS:1000579" name="MS1 spectrum" value=""/>\n+        <cvParam cvRef="MS" accession="MS:1000127" name="centroid spectrum" value=""/>\n+        <cvParam cvRef="MS" accession="MS:1000580" name="MSn spectrum" value=""/>\n+      </fileContent>\n+      <sourceFileList count="3">\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.raw" name="2016_Jan_12_QE2_47.raw" location="">\n+          <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="6f65a1287b2d7522c5fb90a8a5304212b3568b67"/>\n+          <cvParam cvRef="MS" accession="MS:1000563" name="Thermo RAW format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000768" name="Thermo nativeID format" value=""/>\n+        </sourceFile>\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.mzXML" name="2016_Jan_12_QE2_47.mzXML" location="file:///">\n+          <cvParam cvRef="MS" accession="MS:1000776" name="scan number only nativeID format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000566" name="ISB mzXML 
format" value=""/>\n+          <cvParam cvRef="MS" accession="MS:1000569" name="SHA-1" value="d9c6278c1c6a8f8ca6c69133bacd00efc5f2e706"/>\n+        </sourceFile>\n+        <sourceFile id="_x0032_016_Jan_12_QE2_47.mzML" name="2016_Jan_12_QE2_47.mzML" location="file:////Users/kumar207/Documents/Projects/Pratik/script/../mzML">\n+        </sourceFile>\n+      </sourceFileList>\n+    </fileDescription>\n+    <softwareList count="4">\n+      <software id="Xcalibur_x0020_software" version="2.6-264001/2.6.0.2640">\n+        <cvParam cvRef="MS" accession="MS:1000532" name="Xcalibur" value=""/>\n+      </software>\n+      <software id="ReAdW_x0020_software" version="2015.1.0(build Jun 29 2015 14:09:50)">\n+        <cvParam cvRef="MS" accession="MS:1000541" name="ReAdW" value=""/>\n+      </software>\n+      <software id="pwiz_3.0.8990" version="3.0.8990">\n+        <cvParam cvRef="MS" accession="MS:1000615" name="ProteoWizard software" value=""/>\n+      </software>\n+      <software id="pwiz_3.0.7680" version="3.0.7680">\n+        <cvParam cvRef="MS" accession="MS:1000615" name="ProteoWizard software" value=""/>\n+      </software>\n+    </softwareList>\n+    <instrumentConfigurationList count="1">\n+      <instrumentConfiguration id="IC1">\n+        <userParam name="msManufacturer" value="Thermo Finnigan"/>\n+        <userParam name="msModel" value="unknown"/>\n+        <componentList count="3">\n+          <source order="1">\n+            <userParam name="msIonisation" value="NSI"/>\n+          </source>\n+          <analyzer order="1">\n+            <cvParam cvRef="MS" accession="MS:1000079" name="fourier transform ion cyclotron resonance mass spectrometer" value=""/>\n+          </analyzer>\n+          <detector order="1">\n+            <userParam name="msDetector" value="unknown"/>\n+          </detector>\n+        </componentList>\n+        <softwareRef ref="Xcalibur_x0020_software"/>\n+      </instrumentConfiguration>\n+    </instrumentConfigurationList>\n+    
<dataProcessingList count="3">\n+      <dataProcessing id="dataProces'..b'ntrollerNumber=1 scan=90261">2414719</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90324">2418981</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90325">2422771</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90389">2426417</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90825">2430097</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90896">2434045</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90952">2438179</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=90967">2441823</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91020">2445963</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91036">2449647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91084">2453875</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91100">2457676</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91151">2462056</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91167">2465804</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91234">2469798</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91304">2473755</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91379">2477398</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=91464">2481055</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92113">2484679</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92168">2488404</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92183">2492017</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92238">2495975</offset>\n+      <offset 
idRef="controllerType=0 controllerNumber=1 scan=92249">2499652</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92303">2503647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92318">2507303</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=92389">2511184</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94116">2514904</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94183">2518552</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94258">2522503</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94332">2526517</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94729">2530126</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94857">2534084</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94919">2538185</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=94980">2541937</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=95063">2545506</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96694">2549319</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96764">2552999</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96836">2556647</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96918">2560122</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=96993">2563742</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=97169">2567598</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99329">2571216</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99385">2574908</offset>\n+      <offset idRef="controllerType=0 controllerNumber=1 scan=99446">2578708</offset>\n+    </index>\n+    <index name="chromatogram">\n+    </index>\n+  
</indexList>\n+  <indexListOffset>2582386</indexListOffset>\n+  <fileChecksum>23b8703ddaf744e69e1ba72be2584fd3df6f2dc4</fileChecksum>\n+</indexedmzML>\n'
b
diff -r 000000000000 -r 2810c956ec39 test-data/rt_cross_table.parquet
b
Binary file test-data/rt_cross_table.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/tolerances.parquet
b
Binary file test-data/tolerances.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/two_step_hybrid.recetox.parquet
b
Binary file test-data/two_step_hybrid.recetox.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/two_step_hybrid_info.csv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/two_step_hybrid_info.csv Mon Feb 13 10:26:41 2023 +0000
b
@@ -0,0 +1,5 @@
+sample_name, batch
+mbr_test0, 1
+mbr_test1, 1
+mbr_test2, 2
+mbr_test0_copy, 2
b
diff -r 000000000000 -r 2810c956ec39 test-data/unsupervised_output/corrected_features_0.parquet
b
Binary file test-data/unsupervised_output/corrected_features_0.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/unsupervised_output/corrected_features_1.parquet
b
Binary file test-data/unsupervised_output/corrected_features_1.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/unsupervised_output/corrected_features_2.parquet
b
Binary file test-data/unsupervised_output/corrected_features_2.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/unsupervised_output/extracted_features_0.parquet
b
Binary file test-data/unsupervised_output/extracted_features_0.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/unsupervised_output/extracted_features_1.parquet
b
Binary file test-data/unsupervised_output/extracted_features_1.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/unsupervised_output/extracted_features_2.parquet
b
Binary file test-data/unsupervised_output/extracted_features_2.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/unsupervised_output/unsupervised.recetox.parquet
b
Binary file test-data/unsupervised_output/unsupervised.recetox.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/unsupervised_output/unsupervised_aligned_feature_sample_table.parquet
b
Binary file test-data/unsupervised_output/unsupervised_aligned_feature_sample_table.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 test-data/unsupervised_output/unsupervised_recovered_feature_sample_table.parquet
b
Binary file test-data/unsupervised_output/unsupervised_recovered_feature_sample_table.parquet has changed
b
diff -r 000000000000 -r 2810c956ec39 utils.R
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/utils.R Mon Feb 13 10:26:41 2023 +0000
[
@@ -0,0 +1,125 @@
+library(recetox.aplcms)
+
+# Read the sample name from the SAMPLE_NAME environment variable.
+#
+# Returns the variable's value, or NA (after emitting a message) when the
+# variable is unset or set to an empty string.
+get_env_sample_name <- function() {
+    sample_name <- Sys.getenv("SAMPLE_NAME", unset = NA)
+    # nchar(NA_character_) is NA, so the emptiness test has to be guarded by
+    # is.na(); otherwise an unset variable makes `if` fail on a missing value.
+    if (!is.na(sample_name) && !nzchar(sample_name)) {
+        sample_name <- NA
+    }
+    if (is.na(sample_name)) {
+        message("The mzML file does not contain run ID.")
+    }
+    return(sample_name)
+}
+
+# Return a copy of `df` with `sample_name` stored in its "sample_name"
+# attribute.
+save_sample_name <- function(df, sample_name) {
+    `attr<-`(df, "sample_name", value = sample_name)
+}
+
+# Fetch the "sample_name" attribute of `df`; NA when the attribute is absent.
+load_sample_name <- function(df) {
+    stored_name <- attr(df, "sample_name")
+    if (!is.null(stored_name)) {
+        return(stored_name)
+    }
+    NA
+}
+
+# Write `data` to the parquet file `filename` via arrow.
+save_data_as_parquet_file <- function(data, filename) {
+    arrow::write_parquet(x = data, sink = filename)
+}
+
+# Read the parquet file `filename` via arrow and return its contents.
+load_data_from_parquet_file <- function(filename) {
+    arrow::read_parquet(file = filename)
+}
+
+# Read every parquet file in `files` and return the tables as a list of
+# tibbles, one per input file and in the same order.
+load_parquet_collection <- function(files) {
+    read_as_tibble <- function(path) {
+        tibble::as_tibble(arrow::read_parquet(path))
+    }
+    lapply(files, read_as_tibble)
+}
+
+# Write every feature table in `table$feature_tables` to its own parquet file.
+#
+# table: list with a `feature_tables` element holding the tables to write.
+# sample_names: names used positionally, one per feature table.
+# subdir: output directory; files are named "<subdir>_<sample name>.parquet".
+#
+# Each table is converted to a data.frame and tagged with its sample name
+# (through save_sample_name()) before it is written.
+save_parquet_collection <- function(table, sample_names, subdir) {
+    # dir.create() emits a warning when the directory already exists, e.g. when
+    # the step is rerun, so only create it when it is missing.
+    if (!dir.exists(subdir)) {
+        dir.create(subdir)
+    }
+    for (i in seq_along(table$feature_tables)) {
+        filename <- file.path(subdir, paste0(subdir, "_", sample_names[i], ".parquet"))
+        feature_table <- as.data.frame(table$feature_tables[[i]])
+        feature_table <- save_sample_name(feature_table, sample_names[i])
+        arrow::write_parquet(feature_table, filename)
+    }
+}
+
+# Reorder `tables` so that they follow the sorted order of `sample_names`.
+sort_by_sample_name <- function(tables, sample_names) {
+    ordering <- order(sample_names)
+    tables[ordering]
+}
+
+# Write the `mz_tol_relative` and `rt_tol_relative` entries of `table` to the
+# parquet file `tol_file` as columns `mz_tolerance` and `rt_tolerance`.
+save_tolerances <- function(table, tol_file) {
+    tolerances <- data.frame(
+        mz_tolerance = c(table$mz_tol_relative),
+        rt_tolerance = c(table$rt_tol_relative)
+    )
+    arrow::write_parquet(tolerances, tol_file)
+}
+
+# Extract the `mz_tolerance` element from `tolerances`.
+get_mz_tol <- function(tolerances) {
+    mz_tol <- tolerances$mz_tolerance
+    mz_tol
+}
+
+# Extract the `rt_tolerance` element from `tolerances`.
+get_rt_tol <- function(tolerances) {
+    rt_tol <- tolerances$rt_tolerance
+    rt_tol
+}
+
+# Write the `metadata`, `rt` and `intensity` elements of `aligned_features`
+# to `metadata_file`, `rt_file` and `intensity_file` respectively, using
+# save_data_as_parquet_file().
+save_aligned_features <- function(aligned_features, metadata_file, rt_file, intensity_file) {
+    metadata_table <- aligned_features$metadata
+    rt_table <- aligned_features$rt
+    intensity_table <- aligned_features$intensity
+    save_data_as_parquet_file(metadata_table, metadata_file)
+    save_data_as_parquet_file(rt_table, rt_file)
+    save_data_as_parquet_file(intensity_table, intensity_file)
+}
+
+# Pick the table whose "sample_name" attribute equals `sample_name`.
+#
+# tables: list of tables; each name is read through load_sample_name().
+# sample_name: the name to look up.
+#
+# Returns the single matching table. Stops when no table matches or when
+# more than one table matches.
+select_table_with_sample_name <- function(tables, sample_name) {
+    sample_names <- lapply(tables, load_sample_name)
+    # isTRUE() makes NA names (tables without the attribute) count as
+    # non-matching instead of propagating NA.
+    is_match <- vapply(
+        sample_names,
+        function(name) isTRUE(name == sample_name),
+        logical(1)
+    )
+    index <- which(is_match)
+    if (length(index) == 1) {
+        return(tables[[index]])
+    }
+    if (length(index) > 1) {
+        # `tables[[index]]` with several indices indexes recursively into the
+        # first match rather than returning a table, so fail explicitly.
+        stop(sprintf("Ambiguous - sample name '%s' present %d times in %s",
+                     sample_name, length(index),
+                     paste(sample_names, collapse = ", ")))
+    }
+    stop(sprintf("Mismatch - sample name '%s' not present in %s",
+                 sample_name, paste(sample_names, collapse = ", ")))
+}
+
+# Extract the `adjusted_features` element from `recovered_features`.
+select_adjusted <- function(recovered_features) {
+    adjusted <- recovered_features$adjusted_features
+    adjusted
+}
+
+# Names of the known-table columns, in the order used by save_known_table()
+# and read_known_table().
+known_table_columns <- function() {
+  annotation_columns <- c("chemical_formula", "HMDB_ID", "KEGG_compound_ID",
+                          "mass", "ion.type")
+  mz_columns <- c("m.z", "Number_profiles_processed", "Percent_found",
+                  "mz_min", "mz_max")
+  rt_columns <- c("RT_mean", "RT_sd", "RT_min", "RT_max")
+  intensity_columns <- c("int_mean(log)", "int_sd(log)", "int_min(log)",
+                         "int_max(log)")
+  c(annotation_columns, mz_columns, rt_columns, intensity_columns)
+}
+
+# Write the known_table_columns() columns of `table$known_table` to the
+# parquet file `filename`.
+save_known_table <- function(table, filename) {
+  known <- table$known_table
+  arrow::write_parquet(known[known_table_columns()], filename)
+}
+
+# Read a known table from the parquet file `filename`, selecting only the
+# columns returned by known_table_columns().
+read_known_table <- function(filename) {
+  arrow::read_parquet(filename, col_select = known_table_columns())
+}
+
+# Write `table$pairing` to the parquet file `filename` as a tibble whose
+# columns are named "new" and "old".
+#
+# Calls are namespace-qualified so this helper does not depend on the pipe
+# operator or as_tibble() happening to be attached by another package.
+save_pairing <- function(table, filename) {
+  pairing <- tibble::as_tibble(table$pairing)
+  pairing <- stats::setNames(pairing, c("new", "old"))
+  arrow::write_parquet(pairing, filename)
+}
+
+# Bundle the three tables into one list with the elements `metadata`,
+# `intensity` and `rt` (in that order).
+join_tables_to_list <- function(metadata, rt_table, intensity_table) {
+  bundle <- list()
+  bundle[["metadata"]] <- metadata
+  bundle[["intensity"]] <- intensity_table
+  bundle[["rt"]] <- rt_table
+  bundle
+}
+
+# Stop with an error when `sample_names` contains an NA or a repeated value;
+# otherwise do nothing.
+validate_sample_names <- function(sample_names) {
+    has_missing <- anyNA(sample_names)
+    has_duplicates <- anyDuplicated(sample_names) > 0
+    if (has_missing || has_duplicates) {
+        stop(sprintf("Sample names absent or not unique - provided sample names: %s",
+                     paste(sample_names, collapse = ", ")))
+    }
+}