Mercurial > repos > iuc > ena_upload
changeset 2:9e2df763086c draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit 1eed23745846ce215e9bdc4a4934d6bc8f41b24e"
author | iuc |
---|---|
date | Thu, 15 Jul 2021 20:12:34 +0000 (2021-07-15) |
parents | 57251c760cab |
children | 59bb6d34fca6 |
files | ena_upload.xml mappings.py process_xlsx.py samples_macros.xml test-data/metadata_test_viral_optional_columns.xlsx |
diffstat | 5 files changed, 167 insertions(+), 47 deletions(-) [+] |
line wrap: on
line diff
--- a/ena_upload.xml Fri Apr 30 12:09:25 2021 +0000 +++ b/ena_upload.xml Thu Jul 15 20:12:34 2021 +0000 @@ -1,6 +1,6 @@ -<tool id="ena_upload" name="ENA Upload tool" version="0.3.1" profile="20.01" license="MIT"> +<tool id="ena_upload" name="ENA Upload tool" version="0.3.2" profile="20.01" license="MIT"> <macros> - <token name="@VERSION@">0.2.7</token> + <token name="@VERSION@">0.3.1</token> <import>samples_macros.xml</import> </macros> <requirements> @@ -48,7 +48,6 @@ ln -s '$action_options.input_format_conditional.samples_users_table' $samples_table_path && #end if - #if $action_options.test_submit_parameters.dry_run == "false" and $action_options.test_submit == "False": webin_id=`grep 'username' $credentials`; if [ "\$webin_id" = "" ]; then @@ -292,7 +291,58 @@ <assert_contents> <has_n_lines n="5"/> <has_n_columns n="18"/> - <has_line_matching expression="alias\tstatus\taccession\ttitle\tscientific_name\ttaxon_id\tsample_description\tcollection_date\tgeographic_location\thost_common_name\thost_subject_id\thost_health_state\thost_sex\thost_scientific_name\tcollector_name\tcollecting_institution\tisolate\tsubmission_date"/> + <has_line_matching expression="alias\ttitle\tscientific_name\tsample_description\tstatus\taccession\ttaxon_id\tsubmission_date\tgeographic_location\thost_common_name\thost_subject_id\thost_health_state\thost_sex\thost_scientific_name\tcollector_name\tcollecting_institution\tisolate\tcollection_date"/> + </assert_contents> + </output> + <output name="runs_table_out"> + <assert_contents> + <has_n_lines n="5"/> + <has_n_columns n="8"/> + <has_line_matching expression="alias\tstatus\taccession\texperiment_alias\tfile_name\tfile_format\tfile_checksum\tsubmission_date"/> + <has_line_matching expression="r_(.*)_026\tadd\tena_run_accession\te_(.*)_026\tC026_exp5_clean.fastq.gz\tfastq\tfile_checksum\tsubmission_date_ENA"/> + </assert_contents> + </output> + </test> + <!--Test excel input of VIRAL samples with extended columns--> + <test> + <conditional name="action_options"> + <param name="action" value="add"/> + <section name="test_submit_parameters"> + <param name="submit_dev" value="false" /> + <param name="dry_run" value="true" /> + </section> + <conditional name="input_format_conditional"> + <param name="input_format" value="excel_tables"/> + <param name="viral_submission" value="True"/> + <param name="xlsx_file" value="metadata_test_viral_optional_columns.xlsx"/> + <conditional name="run_input_format_conditional"> + <param name="run_input_format" value="multiple_selection_list"/> + <param name="data" value="sample.fq"/> + </conditional> + </conditional> + </conditional> + <param name="center" value="Some research center"/> + <output name="experiments_table_out"> + <assert_contents> + <has_n_lines n="5"/> + <has_n_columns n="17"/> + <has_line_matching expression="alias\tstatus\taccession\ttitle\tstudy_alias\tsample_alias\tdesign_description\tlibrary_name\tlibrary_strategy\tlibrary_source\tlibrary_selection\tlibrary_layout\tinsert_size\tlibrary_construction_protocol\tplatform\tinstrument_model\tsubmission_date" /> + <has_line_matching expression="e_(.*)_026\tadd\taccession_ena\tNanopore sequencing\tSARS-CoV-2_genomes_01\ts_(.*)"/> + </assert_contents> + </output> + <output name="studies_table_out"> + <assert_contents> + <has_n_lines n="2"/> + <has_n_columns n="8"/> + <has_line_matching expression="alias\tstatus\taccession\ttitle\tstudy_type\tstudy_abstract\tpubmed_id\tsubmission_date"/> + <has_line_matching expression="SARS-CoV-2_genomes_01\tadd\tENA_accession\tWhole-genome sequencing of SARS-CoV-2 from Covid-19 patients\tWhole Genome Sequencing\tWhole-genome sequences of SARS-CoV-2 from oro-pharyngeal swabs obtained from Covid-19 patients(.*)"/> + </assert_contents> + </output> + <output name="samples_table_out"> + <assert_contents> + <has_n_lines n="5"/> + <has_n_columns n="42"/> + <has_line_matching expression="alias\ttitle\tscientific_name\tsample_description\tstatus\taccession\ttaxon_id\tsubmission_date\tgeographic_location\thost_common_name\thost_subject_id\thost_health_state\thost_sex\thost_scientific_name\tcollector_name\tcollecting_institution\tisolate\tcollection_date\tgeographic_location_latitude\tgeographic_location_longitude\tsample_capture_status\thost_disease_outcome\thost_age\tvirus_identifier\treceipt_date\tdefinition_for_seropositive_sample\tserotype\thost_habitat\tisolation_source_host_associated\thost_behaviour\tisolation_source_non_host_associated\tsubject_exposure\tsubject_exposure_duration\ttype_exposure\tpersonal_protective_equipment\thospitalisation\tillness_duration\tillness_symptoms\tsample_storage_conditions\tstrain\thost_description\tgravidity"/> </assert_contents> </output> <output name="runs_table_out"> @@ -341,7 +391,7 @@ <assert_contents> <has_n_lines n="5"/> <has_n_columns n="8"/> - <has_line_matching expression="alias\tstatus\taccession\ttitle\tscientific_name\ttaxon_id\tsample_description\tsubmission_date"/> + <has_line_matching expression="alias\ttitle\tscientific_name\tsample_description\tstatus\taccession\ttaxon_id\tsubmission_date"/> </assert_contents> </output> <output name="runs_table_out"> @@ -628,7 +678,7 @@ This is a wrapper for the ENA upload tool in https://github.com/usegalaxy-eu/ena-upload-cli The input metadata can be submitted following the tabular format of the templates in https://github.com/usegalaxy-eu/ena-upload-cli/tree/master/example_tables It is also possible to submit an excel file by following the template in https://drive.google.com/file/d/1ncC22--tW2v-EI-te_r86sAZujIPAjlX/view?usp=sharing - For viral submissions a larger set of metadata is required, you can find the template in https://drive.google.com/file/d/1U4VdcczsIecIXxseV8svE1zO_CBUadog/view?usp=sharing + For viral submissions a larger set of metadata is required, you can find the template in https://drive.google.com/file/d/1Gx78GKh58PmRjdmJ05DBbpObAL-3oUFX/view?usp=sharing ]]></help> <citations> </citations>
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/mappings.py Thu Jul 15 20:12:34 2021 +0000 @@ -0,0 +1,29 @@ + +optional_samples_cols_mapping = { + "collection date": "collection_date", + "geographic location (latitude)": "geographic_location_latitude", + "geographic location (longitude)": "geographic_location_longitude", + "geographic location (region)": "geographic_location_region", + "sample capture status": "sample_capture_status", + "host disease outcome": "host_disease_outcome", + "host_age": "host_age", + "virus identifier": "virus_identifier", + "receipt date": "receipt_date", + "definition for seropositive sample": "definition_for_seropositive_sample", + "serotype (required for a seropositive sample)": "serotype", + "host habitat": "host_habitat", + "isolation source host-associated": "isolation_source_host_associated", + "host behaviour": "host_behaviour", + "isolation source non-host-associated": "isolation_source_non_host_associated", + "subject exposure": "subject_exposure", + "subject exposure duration": "subject_exposure_duration", + "type exposure": "type_exposure", + "personal protective equipment": "personal_protective_equipment", + "hospitalisation": "hospitalisation", + "illness duration": "illness_duration", + "illness symptoms": "illness_symptoms", + "sample storage conditions": "sample_storage_conditions", + "strain": "strain", + "host description": "host_description", + "gravidity": "gravidity" +}
--- a/process_xlsx.py Fri Apr 30 12:09:25 2021 +0000 +++ b/process_xlsx.py Thu Jul 15 20:12:34 2021 +0000 @@ -4,23 +4,35 @@ import xlrd import yaml +from mappings import optional_samples_cols_mapping FILE_FORMAT = 'fastq' -def extract_data(xl_sheet, expected_columns): +def extract_data(xl_sheet, expected_columns, optional_cols=None): """ 1. Check that the columns I expect are present in the sheet (any order and mixed with others, it's just a verification that the user filled the correct template) 2. Fill a dictionary with the rows data indexed by first column in list""" sheet_columns = {} + if optional_cols is None: + optional_cols = [] + optional_cols_loaded = [] for sh_col in range(xl_sheet.ncols): - if xl_sheet.cell(0, sh_col).value in expected_columns: + if (xl_sheet.cell(0, sh_col).value in expected_columns) \ + or (xl_sheet.cell(0, sh_col).value in optional_cols): if xl_sheet.cell(0, sh_col).value in sheet_columns.keys(): - sys.exit("Duplicated columns") + sys.exit("Duplicated columns found") else: sheet_columns[xl_sheet.cell(0, sh_col).value] = sh_col + if xl_sheet.cell(0, sh_col).value in optional_cols: + # store the list of optional cols available + optional_cols_loaded.append(xl_sheet.cell(0, sh_col).value) + provided_cols = expected_columns + optional_cols_loaded + + # check that the required columns are all present + # TODO: revise this for optional columns for col in range(len(expected_columns)): assert expected_columns[col] in sheet_columns.keys(), \ "Expected column %s not found" % expected_columns[col] @@ -32,9 +44,9 @@ # skip first 2 rows: column names + comments rows for row_id in range(2, xl_sheet.nrows): row_dict = {} - for col in range(1, len(expected_columns)): - sheet_col_index = sheet_columns[expected_columns[col]] - row_dict[expected_columns[col]] = xl_sheet.cell(row_id, sheet_col_index).value + for col in range(1, len(provided_cols)): + sheet_col_index = sheet_columns[provided_cols[col]] + row_dict[provided_cols[col]] = xl_sheet.cell(row_id, sheet_col_index).value # should check for duplicate alias/ids? if xl_sheet.cell(row_id, index_col).value in data_dict.keys(): tmp = data_dict[xl_sheet.cell(row_id, index_col).value] @@ -42,7 +54,7 @@ data_dict[xl_sheet.cell(row_id, index_col).value].append(row_dict) else: data_dict[xl_sheet.cell(row_id, index_col).value] = row_dict - return data_dict + return data_dict, optional_cols_loaded def paste_xls2yaml(xlsx_path): @@ -86,22 +98,25 @@ raise ValueError('No entries found in studies sheet') studies_dict = {} studies_col = ['alias', 'title', 'study_type', 'study_abstract'] -studies_dict = extract_data(xl_sheet, studies_col) +studies_dict, _ = extract_data(xl_sheet, studies_col) # PARSE SAMPLES ################# xl_sheet = xl_workbook.sheet_by_name('ENA_sample') if xl_sheet.nrows < 3: raise ValueError('No entries found in samples') + +samples_cols_excel = ['alias', 'title', 'scientific_name', 'sample_description'] +# optional_samples_cols_mapping = {} if args.viral_submission: - samples_cols = ['alias', 'title', 'scientific_name', 'sample_description', - 'geographic location (country and/or sea)', 'host common name', - 'host health state', 'host sex', 'host scientific name', 'collector name', - 'collection date', 'collecting institution', 'isolate'] -else: - samples_cols = ['alias', 'title', 'scientific_name', 'sample_description'] -samples_dict = extract_data(xl_sheet, samples_cols) + # load columns names from the table + samples_cols_excel = samples_cols_excel + ['geographic location (country and/or sea)', + 'host common name', 'host health state', + 'host sex', 'host scientific name', 'collector name', + 'collecting institution', 'isolate'] +samples_dict, samples_optional_cols_loaded = extract_data(xl_sheet, samples_cols_excel, + optional_samples_cols_mapping.keys()) # PARSE EXPERIMENTS ################# xl_sheet = xl_workbook.sheet_by_name('ENA_experiment') @@ -112,7 +127,7 @@ 'library_layout', 'insert_size', 'library_construction_protocol', 'platform', 'instrument_model'] -experiments_dict = extract_data(xl_sheet, exp_columns) +experiments_dict, _ = extract_data(xl_sheet, exp_columns) # PARSE RUNS SHEET ################# @@ -120,23 +135,27 @@ if xl_sheet.nrows < 3: raise ValueError('No entries found in runs sheet') run_cols = ['alias', 'experiment_alias', 'file_name', 'file_format'] -runs_dict = extract_data(xl_sheet, run_cols) +runs_dict, _ = extract_data(xl_sheet, run_cols) # WRITE HEADERS TO TABLES studies_table = open(pathlib.Path(args.out_path) / 'studies.tsv', 'w') studies_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_type', 'study_abstract', 'pubmed_id', 'submission_date']) + '\n') samples_table = open(pathlib.Path(args.out_path) / 'samples.tsv', 'w') + +samples_cols = ['alias', 'title', 'scientific_name', 'sample_description'] +# extend the samples_cols list to add the ones that are filled by the CLI +samples_cols = samples_cols + ['status', 'accession', 'taxon_id', 'submission_date'] if args.viral_submission: - samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name', - 'taxon_id', 'sample_description', 'collection_date', - 'geographic_location', 'host_common_name', 'host_subject_id', - 'host_health_state', 'host_sex', 'host_scientific_name', - 'collector_name', 'collecting_institution', 'isolate', - 'submission_date']) + '\n') -else: - samples_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'scientific_name', - 'taxon_id', 'sample_description', 'submission_date']) + '\n') + # extend the samples columns with the viral specific data + samples_cols = samples_cols + ['geographic_location', 'host_common_name', + 'host_subject_id', 'host_health_state', 'host_sex', + 'host_scientific_name', 'collector_name', + 'collecting_institution', 'isolate'] + if len(samples_optional_cols_loaded) > 0: + for optional_cols_excel in samples_optional_cols_loaded: + samples_cols.append(optional_samples_cols_mapping[optional_cols_excel]) +samples_table.write('\t'.join(samples_cols) + '\n') experiments_table = open(pathlib.Path(args.out_path) / 'experiments.tsv', 'w') experiments_table.write('\t'.join(['alias', 'status', 'accession', 'title', 'study_alias', @@ -164,22 +183,44 @@ 'ENA_submission_data']) + '\n') # assuming no pubmed_id for sample_alias, sample in samples_dict.items(): # sample_alias = sample_alias + '_' + timestamp + samples_row_values = [sample_alias, sample['title'], sample['scientific_name'], + sample['sample_description'], action, 'ena_accession', + 'tax_id_updated_by_ENA', 'ENA_submission_date'] if args.viral_submission: + # add the values that are unique for the viral samples if sample['collector name'] == '': sample['collector name'] = 'unknown' - samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'], - sample['scientific_name'], 'tax_id_updated_by_ENA', - sample['sample_description'], sample['collection date'], - sample['geographic location (country and/or sea)'], - sample['host common name'], 'host subject id', - sample['host health state'], sample['host sex'], - sample['host scientific name'], sample['collector name'], - sample['collecting institution'], sample['isolate'], - 'ENA_submission_date']) + '\n') - else: - samples_table.write('\t'.join([sample_alias, action, 'ena_accession', sample['title'], - sample['scientific_name'], 'tax_id_updated_by_ENA', - sample['sample_description']]) + '\n') + samples_row_values = samples_row_values + \ + [sample['geographic location (country and/or sea)'], sample['host common name'], + 'host subject id', sample['host health state'], sample['host sex'], + sample['host scientific name'], sample['collector name'], + sample['collecting institution'], sample['isolate']] + # add the (possible) optional columns values + if len(samples_optional_cols_loaded) > 0: + for optional_col in samples_optional_cols_loaded: + # parse values stored as in excel date format (=float) + if optional_col in ('collection date', 'receipt date'): + # check if excel stored it as date + if isinstance(sample[optional_col], float): + year, month, day, hour, minute, second = xlrd.xldate_as_tuple( + sample[optional_col], xl_workbook.datemode) + month = "{:02d}".format(month) + day = "{:02d}".format(day) + hour = "{:02d}".format(hour) + minute = "{:02d}".format(minute) + second = "{:02d}".format(second) + # format it as 2008-01-23T19:23:10 + sample[optional_col] = str(year) + '-' + str(month) + '-' + str(day) + \ + 'T' + str(hour) + ':' + str(minute) + ':' + str(second) + # excel stores everything as float so I need to check if + # the value was actually an int and keep it as int + if isinstance(sample[optional_col], float): + if int(sample[optional_col]) == sample[optional_col]: + # it is not really a float but an int + sample[optional_col] = int(sample[optional_col]) + samples_row_values.append(str(sample[optional_col])) + samples_table.write('\t'.join(samples_row_values) + '\n') + for exp_alias, exp in experiments_dict.items(): # should I check here if any experiment has a study or sample alias that is incorrect? # (not listed in the samples or study dict)
--- a/samples_macros.xml Fri Apr 30 12:09:25 2021 +0000 +++ b/samples_macros.xml Thu Jul 15 20:12:34 2021 +0000 @@ -138,7 +138,7 @@ </options> </param> <repeat name="rep_runs" title="Runs executed within this experiment" min="1" > - <param name="run_base_name" type="text" optional="False" default="" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/> + <param name="run_base_name" type="text" optional="False" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/> <param name="upload_files" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/> </repeat> </repeat> @@ -203,7 +203,7 @@ </options> </param> <repeat name="rep_runs" title="Runs executed within this experiment" min="1" > - <param name="run_base_name" type="text" optional="False" default="" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/> + <param name="run_base_name" type="text" optional="False" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/> <param name="upload_files" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/> </repeat> </repeat>