Galaxy |

Changeset 4:26ccb678abc8 (2021-10-19)

Previous changeset 3:59bb6d34fca6 (2021-08-18) Next changeset 5:e1b3b37aa69f (2021-10-27)

Commit message:
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit ba358013c83e7dfffec895946d36585f237e54c5"

modified:
ena_upload.xml
process_xlsx.py
samples_macros.xml

added:
check_remote.py

diff -r 59bb6d34fca6 -r 26ccb678abc8 check_remote.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/check_remote.py Tue Oct 19 15:57:14 2021 +0000

[

@@ -0,0 +1,23 @@
+import json
+
+import requests
+
+URL = "https://www.ebi.ac.uk/ena/portal/api/search"
+
+
+def check_remote_entry(entry_type, query_dict, out_format='json', timeout=60):
+    '''
+    Check whether entries matching a query exist in the ENA repos.
+
+    Sends a POST request to the ENA search endpoint (URL) and returns the
+    decoded JSON response.
+
+    entry_type = [study | sample | experiment | run]
+    query_dict = mapping of search field -> expected value, e.g.
+        {'study_alias': 'my_study'}; conditions are combined with AND
+    out_format = value sent as the 'format' request parameter; the response
+        body is always decoded as JSON, so only 'json' is expected to work
+    timeout = seconds to wait for the server before giving up
+
+    Returns the decoded JSON document, or an empty list when the response
+    body is empty.
+    Raises requests.HTTPError when the server answers with an error status
+    and requests.Timeout when it does not answer in time.
+    '''
+    assert entry_type in ['study', 'sample', 'experiment', 'run']
+    # String values are double-quoted, as ENA's query syntax expects, so an
+    # alias containing spaces is sent as one search term.
+    query_str = ' AND '.join(['%s="%s"' % (key, value) for (key, value) in query_dict.items()])
+    params_dict = {
+        'query': query_str,
+        'result': 'read_' + entry_type,
+        'fields': entry_type + '_alias',
+        'format': out_format,
+    }
+    # A timeout keeps the tool from hanging forever on an unresponsive server.
+    response = requests.post(URL, data=params_dict, timeout=timeout)
+    # Fail loudly on HTTP errors: an error body must not be mistaken for a hit.
+    response.raise_for_status()
+    if response.content != b'':
+        return json.loads(response.content)
+    return []

diff -r 59bb6d34fca6 -r 26ccb678abc8 ena_upload.xml
--- a/ena_upload.xml Wed Aug 18 19:42:49 2021 +0000
+++ b/ena_upload.xml Tue Oct 19 15:57:14 2021 +0000

[

b'@@ -1,6 +1,6 @@\n-<tool id="ena_upload" name="ENA Upload tool" version="0.3.3" profile="20.01" license="MIT">\n+<tool id="ena_upload" name="ENA Upload tool" version="@VERSION@" profile="20.01" license="MIT">\n <macros>\n- <token name="@VERSION@">0.3.1</token>\n+ <token name="@VERSION@">0.4.1</token>\n <import>samples_macros.xml</import>\n </macros>\n <requirements>\n@@ -33,6 +33,9 @@\n #if $action_options.input_format_conditional.viral_submission == "true":\n --vir \n #end if\n+ #if $action_options.test_submit_parameters.submit_dev == "true":\n+ --dev\n+ #end if\n --action \'$action_options.action\' --form \'$action_options.input_format_conditional.xlsx_file\' --out_dir ./submission_files --verbose > \'$output\';\n #end if\n \n@@ -84,7 +87,26 @@\n #for $run in $experiment.rep_runs:\n #for $file in $run.upload_files:\n #set $safename_reads_file = re.sub(\'[^\\w\\-_\\.]\', \'_\', $file.element_identifier)\n- ln -s \'$file\' $safename_reads_file &&\n+ #if $action_options.input_format_conditional.add_extension == "true":\n+ #set $extension = \'.fastq\'\n+ #else\n+ #set $extension = \'\'\n+ #end if\n+ #if $file.is_of_type(\'fastq\', \'fastqsanger\'):\n+ ## compression output is defined as safename_reads_file so no need to symlink\n+ #set $safename_reads_file = $safename_reads_file + $extension + \'.gz\'\n+ gzip -c \'$file\' > $safename_reads_file &&\n+ #else:\n+ #if $action_options.input_format_conditional.add_extension == "true":\n+ #if $file.is_of_type(\'fastq.gz\', \'fastqsanger.gz\'):\n+ #set $compression = \'.gz\' \n+ #elif $file.is_of_type(\'fastqsanger.bz2\', \'fastq.bz2\'):\n+ #set $compression = \'.bz2\' \n+ #end if\n+ #set $safename_reads_file = $safename_reads_file + $extension + $compression \n+ #end if\n+ ln -s \'$file\' $safename_reads_file &&\n+ #end if\n $files_to_upload.append(str($safename_reads_file))\n #end for\n #end for\n@@ -95,31 +117,61 @@\n #if $action_options.input_format_conditional.run_input_format_conditional.run_input_format == 
\'paired_list\':\n #for $pair in $action_options.input_format_conditional.run_input_format_conditional.paired_end_collection:\n #set $safename_reads_file = re.sub(\'[^\\w\\-_\\.]\', \'_\', $pair.name)\n- #if $pair.forward.is_of_type(\'fastq.gz\', \'fastqsanger.gz\'):\n- #set $safename_fwd_reads_file = $safename_reads_file + \'_1.fastq.gz\'\n- #elif $pair.forward.is_of_type(\'fastqsanger.bz2\', \'fastq.bz2\'):\n- #set $safename_fwd_reads_file = $safename_reads_file + \'_1.fastq.bz2\'\n- #else:\n- #set $safename_fwd_reads_file = $safename_reads_file + \'_1.fastq\'\n- #end if\n- #if $pair.reverse.is_of_type(\'fastq.gz\', \'fastqsanger.gz\'):\n- #set $safename_rev_reads_file = $safename_reads_file + \'_2.fastq.gz\'\n- #elif $pair.reverse.is_of_type(\'fastqsanger.bz2\', \'fastq.bz2\'):\n- #set $safename_rev_reads_file = $safename_reads_file + \'_2.fastq.bz2\'\n- #else:\n- #set $safename_rev_reads_file = $safename_reads_file + \'_2.fastq\'\n+ ## Always need to add .fastq + compression suffix because the name is based on the pair name which has no extensions\n+ #if $pair.forward.is_of_type(\'fastq\', \'fastqsanger\'):\n+ ## compress the file, no need to create the link then\n+ ## always add the compression suffix (.gz)\n+ #set $safename_fwd_reads_file = $safename_reads_file + \'_1\' + \'fastq\' + \'.gz\'\n+ gzip -c \'$file\' > $safename_'..b'+ <param name="library_layout" value="SINGLE"/>\n+ <param name="insert_size" value="150"/>\n+ <param name="library_construction_protocol" value="Test library construction"/>\n+ <param name="platform" value="ILLUMINA"/>\n+ <param name="instrument_model" value="Illumina HiSeq 4000"/>\n+ <repeat name="rep_runs">\n+ <param name="run_base_name" value="run_from_hospital_X"/>\n+ <param name="upload_files" value="1.fastqsanger.gz,2.fastqsanger.gz" ftype="fastqsanger.gz"/>\n+ </repeat>\n+ </repeat>\n+ </repeat>\n+ </repeat>\n+ </conditional>\n+ </conditional>\n+ </conditional>\n+ <param name="center" value="Some research center"/>\n+ 
<assert_command>\n+ <has_text_matching expression="ena-upload-cli"/>\n+ <has_text_matching expression="--data \'1.fastqsanger.gz\' \'2.fastqsanger.gz\'"/>\n+ <has_text_matching expression="--action \'add\' --center \'Some research center\'"/>\n+ <has_text_matching expression="--checklist ERC000033"/>\n+ </assert_command>\n+ <assert_stderr>\n+ <has_text_matching expression="Oops, the file test_fake_path does not exist"/>\n+ </assert_stderr>\n+ </test>\n+ \n+ <test expect_failure="true">\n+ <conditional name="action_options">\n+ <param name="action" value="modify"/>\n+ <section name="test_submit_parameters">\n+ <param name="submit_dev" value="false" />\n+ <param name="dry_run" value="false" />\n+ </section>\n+ <param name="test_submit" value="True"/>\n+ <conditional name="input_format_conditional">\n+ <param name="add_extension" value="False"/>\n <param name="input_format" value="build_tables"/>\n <conditional name="conditional_viral_metadata">\n <param name="viral_sample" value="True"/>\n@@ -654,7 +817,7 @@\n <param name="instrument_model" value="Illumina HiSeq 4000"/>\n <repeat name="rep_runs">\n <param name="run_base_name" value="run_from_hospital_X"/>\n- <param name="upload_files" value="1.fastqsanger.gz,sample.fq" ftype="fastqsanger"/>\n+ <param name="upload_files" value="sample.fq" ftype="fastqsanger"/>\n </repeat>\n </repeat>\n </repeat>\n@@ -665,9 +828,10 @@\n <param name="center" value="Some research center"/>\n <assert_command>\n <has_text_matching expression="ena-upload-cli"/>\n- <has_text_matching expression="--data \'1.fastqsanger.gz\' \'sample.fq\'"/>\n- <has_text_matching expression="--action \'add\' --center \'Some research center\'"/>\n- <has_text_matching expression="--vir"/>\n+ <has_text_matching expression="--data \'sample.fq.gz\'"/>\n+ <has_text_matching expression="--action \'modify\' --center \'Some research center\'"/>\n+ <has_text_matching expression="--checklist ERC000033"/>\n+ <not_has_text text="add" />\n </assert_command>\n 
<assert_stderr>\n <has_text_matching expression="Oops, the file test_fake_path does not exist"/>\n'

diff -r 59bb6d34fca6 -r 26ccb678abc8 process_xlsx.py
--- a/process_xlsx.py Wed Aug 18 19:42:49 2021 +0000
+++ b/process_xlsx.py Tue Oct 19 15:57:14 2021 +0000

[

@@ -4,11 +4,24 @@

import xlrd
import yaml
+from check_remote import check_remote_entry
from mappings import optional_samples_cols_mapping

FILE_FORMAT = 'fastq'

+def identify_action(entry_type, alias):
+    '''
+    Decide which action ['add' | 'modify'] has to be performed for an entry.
+
+    Queries check_remote_entry for <entry_type>_alias = alias and prints the
+    outcome: 'modify' is returned when the lookup result is non-empty,
+    'add' otherwise.
+    '''
+    alias_field = entry_type + '_alias'
+    matches = check_remote_entry(entry_type, {alias_field: alias})
+    if len(matches) == 0:
+        print(f'No {entry_type} entry found with alias {alias}')
+        return 'add'
+    print(f'Found: {entry_type} entry with alias {alias}')
+    return 'modify'
+
+
def extract_data(xl_sheet, expected_columns, optional_cols=None):
     """
     1. Check that the columns I expect are present in the sheet
@@ -86,6 +99,7 @@
parser.add_argument('--out_dir', dest='out_path', required=True)
parser.add_argument('--action', dest='action', required=True)
parser.add_argument('--vir', dest='viral_submission', required=False, action='store_true')
+parser.add_argument('--dev', dest='dev_submission', required=False, action='store_true')
parser.add_argument('--verbose', dest='verbose', required=False, action='store_true')
args = parser.parse_args()

@@ -148,10 +162,10 @@
samples_cols = samples_cols + ['status', 'accession', 'taxon_id', 'submission_date']
if args.viral_submission:
     # extend the samples columns with the viral specific data
-    samples_cols = samples_cols + ['geographic_location', 'host_common_name',
-                                   'host_subject_id', 'host_health_state', 'host_sex',
-                                   'host_scientific_name', 'collector_name',
-                                   'collecting_institution', 'isolate']
+    samples_cols = samples_cols + ['geographic location (country and/or sea)', 'host common name',
+                                   'host subject id', 'host health state', 'host sex',
+                                   'host scientific name', 'collector name',
+                                   'collecting institution', 'isolate']
     if len(samples_optional_cols_loaded) > 0:
         for optional_cols_excel in samples_optional_cols_loaded:
             samples_cols.append(optional_samples_cols_mapping[optional_cols_excel])
@@ -168,7 +182,7 @@
runs_table.write('\t'.join(['alias', 'status', 'accession', 'experiment_alias', 'file_name',
                             'file_format', 'file_checksum', 'submission_date']) + '\n')
action = args.action
-
+# actionable_items
# WRITE  DICTIONARIES TO TABLE FILES

# ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS?
@@ -178,14 +192,22 @@
exp_included = []
for study_alias, study in studies_dict.items():
     # study_alias = study_alias + '_' + timestamp
-    studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'],
+    if args.dev_submission:
+        entry_action = args.action
+    else:
+        entry_action = identify_action('study', study_alias)
+    studies_table.write('\t'.join([study_alias, entry_action, 'ENA_accession', study['title'],
                                    study['study_type'], study['study_abstract'], '',
                                    'ENA_submission_data']) + '\n')  # assuming no pubmed_id
for sample_alias, sample in samples_dict.items():
     # sample_alias = sample_alias + '_' + timestamp
+    if args.dev_submission:
+        entry_action = args.action
+    else:
+        entry_action = identify_action('sample', sample_alias)
     samples_row_values = [sample_alias, sample['title'], sample['scientific_name'],
-                          sample['sample_description'], action, 'ena_accession',
-                          'tax_id_updated_by_ENA', 'ENA_submission_date']
+                          sample['sample_description'], entry_action, 'ena_accession',
+                          '', 'ENA_submission_date']
     if args.viral_submission:
         # add the values that are unique for the viral samples
         if sample['collector name'] == '':
@@ -230,7 +252,12 @@
         # (not listed in the samples or study dict)
         # process the experiments for this sample
         if exp['sample_alias'] == sample_alias:
-            experiments_table.write('\t'.join([exp_alias, action, 'accession_ena', exp['title'],
+            # check the remote status
+            if args.dev_submission:
+                entry_action = args.action
+            else:
+                entry_action = identify_action('experiment', exp_alias)
+            experiments_table.write('\t'.join([exp_alias, entry_action, 'accession_ena', exp['title'],
                                                exp['study_alias'], sample_alias,
                                                exp['design_description'], exp['library_name'],
                                                exp['library_strategy'], exp['library_source'],
@@ -250,9 +277,13 @@
                     runs_list = run
                 for run_entry in runs_list:
                     if run_entry['experiment_alias'] == exp_alias:
-                        runs_table.write('\t'.join([run_alias, action, 'ena_run_accession',
+                        if args.dev_submission:
+                            entry_action = args.action
+                        else:
+                            entry_action = identify_action('run', run_alias)
+                        runs_table.write('\t'.join([run_alias, entry_action, 'ena_run_accession',
                                                     exp_alias, run_entry['file_name'],
-                                                    FILE_FORMAT, 'file_checksum',
+                                                    FILE_FORMAT, '',
                                                     'submission_date_ENA']) + '\n')
                 runs_included.append(run_alias)

diff -r 59bb6d34fca6 -r 26ccb678abc8 samples_macros.xml
--- a/samples_macros.xml Wed Aug 18 19:42:49 2021 +0000
+++ b/samples_macros.xml Tue Oct 19 15:57:14 2021 +0000

@@ -12,10 +12,11 @@
                 <option value="paired_list" selected="False">Input from a paired collection</option>
             </param>
             <when value="multiple_selection_list">
-                <param name="data" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="Select individual datasets or a dataset collection" help="Names should match the compressed run's files names defined in the metadata"/>
+                <param name="add_extension" type="boolean" checked="False" label="Add .fastq.(gz,.bz2) extension to the Galaxy dataset names to match the ones described in the input tables?"/>
+                <param name="data" type="data" format="fastq,fastqsanger,fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="Select individual datasets or a dataset collection" help="Names should match the compressed run's files names defined in the metadata"/>
             </when>
             <when value="paired_list">
-                <param name="paired_end_collection" collection_type="list:paired" type="data_collection" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" label="List of paired-end runs files" help="Names should match the compressed run's files names defined in the metadata" />
+                <param name="paired_end_collection" collection_type="list:paired" type="data_collection" format="fastq,fastqsanger,fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" label="List of paired-end runs files" help="Names should match the compressed run's files names defined in the metadata" />
             </when>
         </conditional>
     </xml>
@@ -40,6 +41,7 @@
                 <param name="runs_users_table" type="data" format="tabular" multiple="false" label="Runs table" help="Runs metadata file"/>
             </when>
             <when value="build_tables">
+                <param name="add_extension" type="boolean" checked="false" label="Add .fastq.(gz.bz2) extension to the Galaxy dataset names to match the ones described in the input tables?"/>
                 <conditional name="conditional_viral_metadata">
                     <param name="viral_sample" type="boolean" truevalue="true" falsevalue="false" label="Does your submission contains viral samples?" />
                     <when value="true">
@@ -138,8 +140,8 @@
                         </options>
                     </param>
                     <repeat name="rep_runs" title="Runs executed within this experiment" min="1" >
-                        <param name="run_base_name" type="text" optional="False" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/>
-                        <param name="upload_files" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/>
+                        <param name="run_base_name" type="text" optional="False" value="" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/>
+                        <param name="upload_files" type="data" format="fastq,fastqsanger,fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/>
                     </repeat>
                 </repeat>
             </repeat>
@@ -203,8 +205,8 @@
                     </options>
                 </param>
                 <repeat name="rep_runs" title="Runs executed within this experiment" min="1" >
-                    <param name="run_base_name" type="text" optional="False" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/>
-                    <param name="upload_files" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/>
+                    <param name="run_base_name" type="text" optional="False" value="" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/>
+                    <param name="upload_files" type="data" format="fastq,fastqsanger,fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/>
                 </repeat>
             </repeat>
         </repeat>