Repository 'ena_upload'
hg clone https://toolshed.g2.bx.psu.edu/repos/iuc/ena_upload

Changeset 1:57251c760cab (2021-04-30)
Previous changeset 0:382518f24d6d (2020-11-28) Next changeset 2:9e2df763086c (2021-07-15)
Commit message:
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit ffea061c1ad6e7291abfe220230dbdbe8d19a2bd"
modified:
ena_upload.xml
extract_tables.py
process_xlsx.py
samples_macros.xml
added:
test-data/2.fastqsanger.gz
b
diff -r 382518f24d6d -r 57251c760cab ena_upload.xml
--- a/ena_upload.xml Sat Nov 28 09:45:44 2020 +0000
+++ b/ena_upload.xml Fri Apr 30 12:09:25 2021 +0000
[
b'@@ -1,24 +1,31 @@\n-<tool id="ena_upload" name="ENA Upload tool" version="0.3" profile="20.01" license="MIT">\n+<tool id="ena_upload" name="ENA Upload tool" version="0.3.1" profile="20.01" license="MIT">\n     <macros>\n-        <token name="@VERSION@">0.2.4</token>\n+        <token name="@VERSION@">0.2.7</token>\n         <import>samples_macros.xml</import>\n     </macros>\n     <requirements>\n         <requirement type="package" version="@VERSION@">ena-upload-cli</requirement>\n         <requirement type="package" version="1.2.0">xlrd</requirement>\n     </requirements>\n+    <stdio>\n+        <regex match="Oops" source="stderr" level="fatal"/>\n+        <regex match="different file names between command line and RUN table" source="stderr" level="fatal"/>\n+    </stdio>\n     <command detect_errors="exit_code"><![CDATA[\n mkdir ./submission_files; \n+#set $studies_table_path = \'./submission_files/studies.tsv\'\n+#set $samples_table_path =   \'./submission_files/samples.tsv\'\n+#set $experiments_table_path = \'./submission_files/experiments.tsv\'\n+#set $runs_table_path =  \'./submission_files/runs.tsv\'\n+        \n+#set $studies_table_path_updated = \'./submission_files/studies_updated.tsv\'\n+#set $samples_table_path_updated =   \'./submission_files/samples_updated.tsv\'\n+#set $experiments_table_path_updated = \'./submission_files/experiments_updated.tsv\'\n+#set $runs_table_path_updated =  \'./submission_files/runs_updated.tsv\'\n \n #set working_dir = os.getcwd()\n-#set $dry_run_option = "False"\n-#set viral_submission = "False"\n #if $action_options.input_format_conditional.input_format == "build_tables":\n   python \'$__tool_directory__/extract_tables.py\' --action $action_options.action --out_dir ./submission_files --studies $studies_json;\n-  #set $studies_table_path = \'./submission_files/studies.tsv\'\n-  #set $samples_table_path =   \'./submission_files/samples.tsv\'\n-  #set $experiments_table_path = \'./submission_files/experiments.tsv\'\n-  
#set $runs_table_path =  \'./submission_files/runs.tsv\'\n #end if\n \n #if $action_options.input_format_conditional.input_format == "excel_tables":\n@@ -26,35 +33,30 @@\n     #if $action_options.input_format_conditional.viral_submission == "true":\n         --vir \n     #end if\n-    --action \'$action_options.action\' --form \'$action_options.input_format_conditional.xlsx_file\' --out_dir ./submission_files ;\n-    #set $studies_table_path = \'./submission_files/studies.tsv\'\n-    #set $samples_table_path =   \'./submission_files/samples.tsv\'\n-    #set $experiments_table_path = \'./submission_files/experiments.tsv\'\n-    #set $runs_table_path =  \'./submission_files/runs.tsv\'\n-    #if $action_options.input_format_conditional.dry_run == "true":\n-      #set $dry_run_option = "True"\n-    #end if\n+    --action \'$action_options.action\' --form \'$action_options.input_format_conditional.xlsx_file\' --out_dir ./submission_files --verbose > \'$output\';\n #end if\n \n #if $action_options.input_format_conditional.input_format != "user_generated_tables":\n-    cp $studies_table_path $studies_table_out;\n-    cp $samples_table_path $samples_table_out;\n-    cp $experiments_table_path $experiments_table_out;\n-    cp $runs_table_path $runs_table_out;\n-    #if $action_options.input_format_conditional.dry_run == "true":\n-      #set $dry_run_option = "True"\n-    #end if\n+    cp $studies_table_path $studies_table_out &&\n+    cp $samples_table_path $samples_table_out &&\n+    cp $experiments_table_path $experiments_table_out &&\n+    cp $runs_table_path $runs_table_out &&\n+#else:\n+    ln -s \'$action_options.input_format_conditional.experiments_users_table\' $experiments_table_path &&\n+    ln -s \'$action_options.input_format_conditional.studies_users_table\' $studies_table_path &&\n+    ln -s \'$action_options.input_format_conditional.runs_users_table\' $runs_table_path &&\n+    ln -s \'$action_options.input_format_conditional.samples_users_table\' 
$samples_table_path &&\n #end if\n \n \n-#if $dry_run_option == "False" and $action_options.test_submit == "False":\n+#if $action_options.test_sub'..b'\n@@ -428,13 +508,15 @@\n         <test expect_failure="true">\n             <conditional name="action_options">\n                 <param name="action" value="add"/>\n-                <param name="submit_dev" value="True"/>\n+                <section name="test_submit_parameters">\n+                    <param name="submit_dev" value="true" />\n+                    <param name="dry_run" value="false" />\n+                </section>\n                 <param name="test_submit" value="True"/>\n                 <conditional name="input_format_conditional">\n                     <param name="input_format" value="build_tables"/>\n-                    <param name="dry_run" value="False"/>\n                     <conditional name="conditional_viral_metadata">\n-                        <param name="viral_sample" value="False"/>\n+                        <param name="viral_sample" value="false"/>\n                         <repeat name="rep_study">\n                             <param name="study_title" value="Test study title"/>\n                             <param name="study_abstract" value="Test study abstract"/>\n@@ -457,6 +539,7 @@\n                                     <param name="platform" value="ILLUMINA"/>\n                                     <param name="instrument_model" value="Illumina HiSeq 4000"/>\n                                     <repeat name="rep_runs">\n+                                        <param name="run_base_name" value="run_from_hospital_X"/>\n                                         <param name="upload_files" value="1.fastqsanger.gz,sample.fq" ftype="fastqsanger"/>\n                                     </repeat>\n                                 </repeat>\n@@ -472,18 +555,20 @@\n                 <has_text_matching expression="--action \'add\' --center \'Some research center\'"/>\n             
</assert_command>\n             <assert_stderr>\n-                <has_text_matching expression="ENA_upload: error: Oops, the file test_fake_path does not exist"/>\n+                <has_text_matching expression="Oops, the file test_fake_path does not exist"/>\n             </assert_stderr>\n         </test>\n         <!--test viral submission - User input metadata-->\n         <test expect_failure="true">\n             <conditional name="action_options">\n                 <param name="action" value="add"/>\n-                <param name="submit_dev" value="False"/>\n+                <section name="test_submit_parameters">\n+                    <param name="submit_dev" value="false" />\n+                    <param name="dry_run" value="false" />\n+                </section>\n                 <param name="test_submit" value="True"/>\n                 <conditional name="input_format_conditional">\n                     <param name="input_format" value="build_tables"/>\n-                    <param name="dry_run" value="False"/>\n                     <conditional name="conditional_viral_metadata">\n                         <param name="viral_sample" value="True"/>\n                         <repeat name="rep_study">\n@@ -518,6 +603,7 @@\n                                     <param name="platform" value="ILLUMINA"/>\n                                     <param name="instrument_model" value="Illumina HiSeq 4000"/>\n                                     <repeat name="rep_runs">\n+                                        <param name="run_base_name" value="run_from_hospital_X"/>\n                                         <param name="upload_files" value="1.fastqsanger.gz,sample.fq" ftype="fastqsanger"/>\n                                     </repeat>\n                                 </repeat>\n@@ -534,7 +620,7 @@\n                 <has_text_matching expression="--vir"/>\n             </assert_command>\n             <assert_stderr>\n-                <has_text_matching 
expression="ENA_upload: error: Oops, the file test_fake_path does not exist"/>\n+                <has_text_matching expression="Oops, the file test_fake_path does not exist"/>\n             </assert_stderr>\n         </test>\n     </tests>\n'
b
diff -r 382518f24d6d -r 57251c760cab extract_tables.py
--- a/extract_tables.py Sat Nov 28 09:45:44 2020 +0000
+++ b/extract_tables.py Fri Apr 30 12:09:25 2021 +0000
[
@@ -3,6 +3,11 @@
 import pathlib
 from datetime import datetime
 
+"""
+Parse the configfile generated by the Galaxy tool.
+This file is JSON-formatted and should be converted to a set of tabular files.
+"""
+
 FILE_FORMAT = 'fastq'
 
 parser = argparse.ArgumentParser()
@@ -78,11 +83,15 @@
                                                'submission_date_ENA']) + '\n')
             run_index = 0
             # exp['runs'] is a list of lists
-            for run in exp['runs']:
+            for (base_run, run_files) in exp['runs']:
                 run_index += 1
-                run_alias = '.'.join(['run_' + str(run_index), str(exp_index), str(sample_index)]) \
-                            + '_' + timestamp
-                for file_entry in run:
+                if base_run != '':
+                    run_alias = base_run
+                else:
+                    # no alias provided, generate a unique one
+                    run_alias = '_'.join(['run_' + str(run_index), str(exp_index),
+                                          str(sample_index)]) + '_' + timestamp
+                for file_entry in run_files:
                     runs_table.write('\t'.join([run_alias, action, 'ena_run_accession', exp_alias,
                                                 file_entry, FILE_FORMAT, 'file_checksum',
                                                 'submission_date_ENA']) + '\n')
b
diff -r 382518f24d6d -r 57251c760cab process_xlsx.py
--- a/process_xlsx.py Sat Nov 28 09:45:44 2020 +0000
+++ b/process_xlsx.py Fri Apr 30 12:09:25 2021 +0000
[
@@ -3,7 +3,7 @@
 import sys
 
 import xlrd
-
+import yaml
 
 FILE_FORMAT = 'fastq'
 
@@ -36,15 +36,45 @@
             sheet_col_index = sheet_columns[expected_columns[col]]
             row_dict[expected_columns[col]] = xl_sheet.cell(row_id, sheet_col_index).value
         # should check for duplicate alias/ids?
-        data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict
+        if xl_sheet.cell(row_id, index_col).value in data_dict.keys():
+            tmp = data_dict[xl_sheet.cell(row_id, index_col).value]
+            data_dict[xl_sheet.cell(row_id, index_col).value] = [tmp]
+            data_dict[xl_sheet.cell(row_id, index_col).value].append(row_dict)
+        else:
+            data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict
     return data_dict
 
 
+def paste_xls2yaml(xlsx_path):
+    print('YAML -------------')
+    xls = xlrd.open_workbook(xlsx_path)
+    content_dict = {}
+    for sheet_name in xls.sheet_names():
+        if sheet_name == 'controlled_vocabulary':
+            continue
+        xls_sheet = xls.sheet_by_name(sheet_name)
+        sheet_contents_dict = {}
+        colnames = []
+        for col in range(xls_sheet.ncols):
+            colnames.append(xls_sheet.cell(0, col).value)
+        # skip first 2 rows (column names and suggestions)
+        for row_id in range(2, xls_sheet.nrows):
+            row_dict = {}
+            for col_id in range(0, xls_sheet.ncols):
+                row_dict[colnames[col_id]] = xls_sheet.cell(row_id, col_id).value
+            # should check for duplicate alias/ids?
+            sheet_contents_dict[row_id] = row_dict
+        content_dict[sheet_name] = sheet_contents_dict
+    yaml.dump(content_dict, sys.stdout)
+    print('YAML -------------')
+
+
 parser = argparse.ArgumentParser()
 parser.add_argument('--form', dest='xlsx_path', required=True)
 parser.add_argument('--out_dir', dest='out_path', required=True)
 parser.add_argument('--action', dest='action', required=True)
 parser.add_argument('--vir', dest='viral_submission', required=False, action='store_true')
+parser.add_argument('--verbose', dest='verbose', required=False, action='store_true')
 args = parser.parse_args()
 
 xl_workbook = xlrd.open_workbook(args.xlsx_path)
@@ -77,9 +107,11 @@
 xl_sheet = xl_workbook.sheet_by_name('ENA_experiment')
 if xl_sheet.nrows < 3:
     raise ValueError('No experiments found in experiments sheet')
-exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description', 'library_name',
-               'library_strategy', 'library_source', 'library_selection', 'library_layout',
-               'insert_size', 'library_construction_protocol', 'platform', 'instrument_model']
+exp_columns = ['alias', 'title', 'study_alias', 'sample_alias', 'design_description',
+               'library_name', 'library_strategy', 'library_source', 'library_selection',
+               'library_layout', 'insert_size', 'library_construction_protocol',
+               'platform', 'instrument_model']
+
 experiments_dict = extract_data(xl_sheet, exp_columns)
 
 # PARSE RUNS SHEET
@@ -123,6 +155,8 @@
 # ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS?
 # dt_oobj = datetime.now(tz=None)
 # timestamp = dt_oobj.strftime("%Y%m%d_%H:%M:%S")
+runs_included = []
+exp_included = []
 for study_alias, study in studies_dict.items():
     # study_alias = study_alias + '_' + timestamp
     studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'],
@@ -162,12 +196,37 @@
                                                exp['library_construction_protocol'],
                                                exp['platform'], exp['instrument_model'],
                                                'submission_date_ENA']) + '\n')
+            exp_included.append(exp_alias)
             for run_alias, run in runs_dict.items():
-                if run['experiment_alias'] == exp_alias:
-                    runs_table.write('\t'.join([run_alias, action, 'ena_run_accession', exp_alias,
-                                                run['file_name'], FILE_FORMAT, 'file_checksum',
-                                                'submission_date_ENA']) + '\n')
+                # check that the experiments library_layout is set to paired
+                # when multiple entries are associated with the same run alias
+                if not isinstance(run, list):
+                    runs_list = [run]
+                else:
+                    runs_list = run
+                for run_entry in runs_list:
+                    if run_entry['experiment_alias'] == exp_alias:
+                        runs_table.write('\t'.join([run_alias, action, 'ena_run_accession',
+                                                    exp_alias, run_entry['file_name'],
+                                                    FILE_FORMAT, 'file_checksum',
+                                                    'submission_date_ENA']) + '\n')
+                runs_included.append(run_alias)
+
+# report runs not associated with any used experiment, and experiments not associated with any used sample
+for run in runs_dict.keys():
+    if run not in runs_included:
+        print(f'The run {run} is listed in the runs section but not associated with any \
+              used experiment')
+
+for exp in experiments_dict.keys():
+    if exp not in exp_included:
+        print(f'The experiment {exp} is listed in the experiments section but not associated \
+              with any used sample')
+
 studies_table.close()
 samples_table.close()
 experiments_table.close()
 runs_table.close()
+
+if args.verbose:
+    paste_xls2yaml(args.xlsx_path)
b
diff -r 382518f24d6d -r 57251c760cab samples_macros.xml
--- a/samples_macros.xml Sat Nov 28 09:45:44 2020 +0000
+++ b/samples_macros.xml Fri Apr 30 12:09:25 2021 +0000
[
b'@@ -1,5 +1,24 @@\n <macros>\n-\n+    <xml name="test_submit_section">\n+        <section name="test_submit_parameters" expanded="true" title="Testing options">\n+            <param name="submit_dev" type="boolean" truevalue="true" falsevalue="false" label="Submit to test ENA server?" help="By selecting yes the reads will be submitted to the ENA test server. Uploads to test platform will not be public and will be removed in 24hrs. Performing a preliminary test upload is advised to check for errors with metadata structure. You can find the uploads to the test platform at https://wwwdev.ebi.ac.uk/ena/" />\n+            <param name="dry_run" type="boolean" truevalue="true" falsevalue="false" label="Print the tables but do not submit the datasets" help="If yes is selected then NO submission will be performed."/>\n+        </section>\n+    </xml>\n+    <xml name="run_inputs_macro">\n+        <conditional name="run_input_format_conditional">\n+            <param name="run_input_format" type="select" label="Select runs input format">\n+                <option value="multiple_selection_list" selected="True">Select individual datasets or datasets collection</option>\n+                <option value="paired_list" selected="False">Input from a paired collection</option>\n+            </param>\n+            <when value="multiple_selection_list">\n+                <param name="data" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="Select individual datasets or a dataset collection" help="Names should match the compressed run\'s files names defined in the metadata"/>\n+            </when>\n+            <when value="paired_list">\n+                <param name="paired_end_collection" collection_type="list:paired" type="data_collection" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" label="List of paired-end runs files" help="Names should match the compressed run\'s files names defined in the metadata" />\n+            
</when>\n+        </conditional>\n+    </xml>\n     <xml name="table_inputs_macro">\n         <conditional name="input_format_conditional">\n             <param name="input_format" type="select" label="Would you like to submit pregenerated table files or interactively define the input structures?">\n@@ -9,22 +28,20 @@\n             </param>\n             <when value="excel_tables">\n                 <param name="viral_submission" type="boolean" label="Does your submission data belong to a viral sample?" help="If you select yes then your data will be submitted using the ENA virus pathogen reporting standard checklist (see: https://ena-browser-docs.readthedocs.io/en/latest/help_and_guides/sars-cov-2-submissions.html)" />\n-                <param name="dry_run" type="boolean" label="Print the tables but do not submit the datasets" help="If yes is selected then NO submission will be performed."/>\n-                <param name="xlsx_file" type="data" format="xlsx" />\n-                <param name="data" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="Select all datasets to upload" help="Compressed reads files listed in the runs table"/>\n+                <param name="xlsx_file" type="data" format="xlsx" label="Select Excel (xlsx) file based on templates" />\n+                <expand macro="run_inputs_macro" />\n             </when>\n             <when value="user_generated_tables">\n                 <param name="viral_submission" type="boolean" label="Does your submission data belong to a viral sample?" 
help="If you select yes then your data will be submitted using the ENA virus pathogen reporting standard checklist (see: https://ena-browser-docs.readthedocs.io/en/latest/help_and_guides/sars-cov-2-submissions.html)" />\n-                <param name="data" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="Select all datasets to upload" help="Compressed reads files listed in the runs table"/>\n+                <expand macro="run_inp'..b'r submission contains viral samples?" />\n                     <when value="true">\n                         <expand macro="viral_samples" />\n                     </when>\n@@ -51,8 +68,11 @@\n                 <param name="sample_description" type="text" help="e.g: liver cells" label="Describe the type of sample"/>\n                 <param name="scientific_name" type="text" label="Enter the species of the sample" help="e.g Severe acute respiratory syndrome coronavirus 2"/>\n                 <param name="tax_id" type="text" label="Enter the taxonomic ID corresponding to the sample species" />\n-                <param name="collection_date" type="text" label="Collection date" optional="True" help="options are: YYYY, YYYY/MM, YYYY/MM/DD, not collected, restricted access or leave blank">\n-                    <validator type="regex"  message="Data format is not valid">(^[0-9]{4}(-[0-9]{2}(-[0-9]{2}(T[0-9]{2}:[0-9]{2}(:[0-9]{2})?Z?([+-][0-9]{1,2})?)?)?)?(/[0-9]{4}(-[0-9]{2}(-[0-9]{2}(T[0-9]{2}:[0-9]{2}(:[0-9]{2})?Z?([+-][0-9]{1,2})?)?)?)?)?$)|(^not colected$)|(^not provided$)|(^restricted access$)</validator>\n+                <param name="collection_date" type="text" label="Collection date" optional="True" help="options are: YYYY, YYYY-MM, YYYY-MM-DD, not collected, restricted access or not provided">\n+                    <option value="not collected">not collected</option>\n+                    <option value="restricted access">restricted access</option>\n+                    <option 
value="not provided">not provided</option>\n+                    <validator type="regex"  message="Data format is not valid">(^[0-9]{4}(-[0-9]{2}(-[0-9]{2}(T[0-9]{2}:[0-9]{2}(:[0-9]{2})?Z?([+-][0-9]{1,2})?)?)?)?(/[0-9]{4}(-[0-9]{2}(-[0-9]{2}(T[0-9]{2}:[0-9]{2}(:[0-9]{2})?Z?([+-][0-9]{1,2})?)?)?)?)?$)|(^not collected$)|(^not provided$)|(^restricted access$)</validator>\n                 </param>\n                 <param name="geo_location_country" type="select" label="Select the country where the sample was obtained">\n                     <options from_data_table="geographic_location_1">\n@@ -118,6 +138,7 @@\n                         </options>\n                     </param>\n                     <repeat name="rep_runs" title="Runs executed within this experiment" min="1" >\n+                        <param name="run_base_name" type="text" optional="False" default="" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/>\n                         <param name="upload_files" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/>\n                     </repeat>\n                 </repeat>\n@@ -166,7 +187,7 @@\n                 <param name="library_construction_protocol" type="text" label="Please describe the library construction protocol"/>\n                 <param name="platform" type="select" label="Select the sequencing platform used">\n                     <option value="LS454">LS454</option>\n-                    <option value="ILLUMINA">Illumina</option>\n+                    <option value="ILLUMINA" selected="True">Illumina</option>\n                     <option value="HELICOS">Helicos</option>\n                     <option value="ABI_SOLID">ABI Solid</option>\n                     <option value="COMPLETE_GENOMICS">Complete Genomics</option>\n@@ -182,6 +203,7 @@\n                     </options>\n                 </param>\n   
              <repeat name="rep_runs" title="Runs executed within this experiment" min="1" >\n+                    <param name="run_base_name" type="text" optional="False" default="" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/>\n                     <param name="upload_files" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/>\n                 </repeat>\n             </repeat>\n'
b
diff -r 382518f24d6d -r 57251c760cab test-data/2.fastqsanger.gz
b
Binary file test-data/2.fastqsanger.gz has changed