Mercurial > repos > iuc > ena_upload
changeset 4:26ccb678abc8 draft
"planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/master/tools/ena_upload commit ba358013c83e7dfffec895946d36585f237e54c5"
author | iuc |
---|---|
date | Tue, 19 Oct 2021 15:57:14 +0000 (2021-10-19) |
parents | 59bb6d34fca6 |
children | e1b3b37aa69f |
files | check_remote.py ena_upload.xml process_xlsx.py samples_macros.xml |
diffstat | 4 files changed, 282 insertions(+), 62 deletions(-) [+] |
line wrap: on
line diff
# check_remote.py -- file added by this changeset
# (diff header: --- /dev/null  +++ b/check_remote.py  @@ -0,0 +1,23 @@)
'''Look up entries through the ENA portal search API.'''
import json

import requests

URL = "https://www.ebi.ac.uk/ena/portal/api/search"

# Entry types accepted by check_remote_entry.
_ENTRY_TYPES = ('study', 'sample', 'experiment', 'run')

# Seconds to wait for the remote server before giving up; without a timeout
# a stalled connection would block the caller forever.
_DEFAULT_TIMEOUT = 60


def check_remote_entry(entry_type, query_dict, out_format='json', timeout=_DEFAULT_TIMEOUT):
    '''
    Checks if an entry with that alias exists in the ENA repos
    entry_type = [study | sample | experiment | run]

    query_dict: mapping of field name -> value; the pairs are rendered as
        "key=value" and combined with " AND " to build the search query.
    out_format: value sent as the "format" parameter. The response body is
        always decoded with json.loads, so anything other than 'json' is
        presumably unsupported -- TODO confirm.
    timeout: seconds passed to requests.post as its timeout.

    Returns the decoded JSON body, or an empty list when the response body
    is empty.

    Raises ValueError for an unknown entry_type and requests.HTTPError when
    the server answers with a 4xx/5xx status.
    '''
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if entry_type not in _ENTRY_TYPES:
        raise ValueError(
            'entry_type must be one of %s, got %r' % (', '.join(_ENTRY_TYPES), entry_type))
    query_str = ' AND '.join('%s=%s' % (key, value) for key, value in query_dict.items())
    params_dict = {
        'query': query_str,
        'result': 'read_' + entry_type,
        'fields': entry_type + '_alias',
        'format': out_format,
    }
    response = requests.post(URL, data=params_dict, timeout=timeout)
    # Fail loudly on HTTP errors: otherwise a non-empty error body would be
    # parsed and handed back as if it were a list of matching entries.
    response.raise_for_status()
    if response.content != b'':
        return json.loads(response.content)
    return []
--- a/ena_upload.xml Wed Aug 18 19:42:49 2021 +0000 +++ b/ena_upload.xml Tue Oct 19 15:57:14 2021 +0000 @@ -1,6 +1,6 @@ -<tool id="ena_upload" name="ENA Upload tool" version="0.3.3" profile="20.01" license="MIT"> +<tool id="ena_upload" name="ENA Upload tool" version="@VERSION@" profile="20.01" license="MIT"> <macros> - <token name="@VERSION@">0.3.1</token> + <token name="@VERSION@">0.4.1</token> <import>samples_macros.xml</import> </macros> <requirements> @@ -33,6 +33,9 @@ #if $action_options.input_format_conditional.viral_submission == "true": --vir #end if + #if $action_options.test_submit_parameters.submit_dev == "true": + --dev + #end if --action '$action_options.action' --form '$action_options.input_format_conditional.xlsx_file' --out_dir ./submission_files --verbose > '$output'; #end if @@ -84,7 +87,26 @@ #for $run in $experiment.rep_runs: #for $file in $run.upload_files: #set $safename_reads_file = re.sub('[^\w\-_\.]', '_', $file.element_identifier) - ln -s '$file' $safename_reads_file && + #if $action_options.input_format_conditional.add_extension == "true": + #set $extension = '.fastq' + #else + #set $extension = '' + #end if + #if $file.is_of_type('fastq', 'fastqsanger'): + ## compression output is defined as safename_reads_file so no need to symlink + #set $safename_reads_file = $safename_reads_file + $extension + '.gz' + gzip -c '$file' > $safename_reads_file && + #else: + #if $action_options.input_format_conditional.add_extension == "true": + #if $file.is_of_type('fastq.gz', 'fastqsanger.gz'): + #set $compression = '.gz' + #elif $file.is_of_type('fastqsanger.bz2', 'fastq.bz2'): + #set $compression = '.bz2' + #end if + #set $safename_reads_file = $safename_reads_file + $extension + $compression + #end if + ln -s '$file' $safename_reads_file && + #end if $files_to_upload.append(str($safename_reads_file)) #end for #end for @@ -95,31 +117,61 @@ #if $action_options.input_format_conditional.run_input_format_conditional.run_input_format == 'paired_list': #for 
$pair in $action_options.input_format_conditional.run_input_format_conditional.paired_end_collection: #set $safename_reads_file = re.sub('[^\w\-_\.]', '_', $pair.name) - #if $pair.forward.is_of_type('fastq.gz', 'fastqsanger.gz'): - #set $safename_fwd_reads_file = $safename_reads_file + '_1.fastq.gz' - #elif $pair.forward.is_of_type('fastqsanger.bz2', 'fastq.bz2'): - #set $safename_fwd_reads_file = $safename_reads_file + '_1.fastq.bz2' - #else: - #set $safename_fwd_reads_file = $safename_reads_file + '_1.fastq' - #end if - #if $pair.reverse.is_of_type('fastq.gz', 'fastqsanger.gz'): - #set $safename_rev_reads_file = $safename_reads_file + '_2.fastq.gz' - #elif $pair.reverse.is_of_type('fastqsanger.bz2', 'fastq.bz2'): - #set $safename_rev_reads_file = $safename_reads_file + '_2.fastq.bz2' - #else: - #set $safename_rev_reads_file = $safename_reads_file + '_2.fastq' + ## Always need to add .fastq + compression suffix because the name is based on the pair name which has no extensions + #if $pair.forward.is_of_type('fastq', 'fastqsanger'): + ## compress the file, no need to create the link then + ## always add the compression suffix (.gz) + #set $safename_fwd_reads_file = $safename_reads_file + '_1' + 'fastq' + '.gz' + gzip -c '$file' > $safename_fwd_reads_file && + #else + #if $pair.forward.is_of_type('fastq.gz', 'fastqsanger.gz'): + #set $compression = '.gz' + #elif $pair.forward.is_of_type('fastqsanger.bz2', 'fastq.bz2'): + #set $compression = '.bz2' + #end if + #set $safename_fwd_reads_file = $safename_reads_file + '_1' + '.fastq' + $compression + ln -s '$pair.forward' $safename_fwd_reads_file && #end if - ln -s '$pair.forward' $safename_fwd_reads_file && + #if $pair.reverse.is_of_type('fastq', 'fastqsanger'): + ## compress the file, no need to create the link then + #set $safename_reverse_reads_file = $safename_reads_file + '_1' + '.fastq' + '.gz' + gzip -c '$file' > $safename_rev_reads_file && + #else + #if $pair.reverse.is_of_type('fastqsanger.bz2', 'fastq.bz2'): + 
#set $compression = '.bz2' + #elif $pair.reverse.is_of_type('fastqsanger.gz', 'fastq.gz'): + #set $compression = '.gz' + #end if + #set $safename_rev_reads_file = $safename_reads_file + '_2' + '.fastq' + $compression + ln -s '$pair.reverse' $safename_rev_reads_file && + #end if $files_to_upload.append(str($safename_fwd_reads_file)) - ln -s '$pair.reverse' $safename_rev_reads_file && $files_to_upload.append(str($safename_rev_reads_file)) #end for #end if #if $action_options.input_format_conditional.run_input_format_conditional.run_input_format == 'multiple_selection_list': #for $file in $action_options.input_format_conditional.run_input_format_conditional.data: #set $safename_reads_file = re.sub('[^\w\-_\.]', '_', $file.element_identifier) - ln -s '$file' $safename_reads_file && + #if $file.is_of_type('fastq', 'fastqsanger'): + ## always compress add the gz extension + #if $action_options.input_format_conditional.run_input_format_conditional.add_extension == "true": + #set $safename_reads_file = $safename_reads_file + 'fastq.gz' + #else + #set $safename_reads_file = $safename_reads_file + '.gz' + #end if + gzip -c '$file' > $safename_reads_file && + #else + #if $action_options.input_format_conditional.run_input_format_conditional.add_extension == "true": + #if $file.is_of_type('fastq.gz', 'fastqsanger.gz'): + #set $extension = 'fastq.gz' + #elif $file.is_of_type('fastqsanger.bz2', 'fastq.bz2'): + #set $extension = 'fastq.bz2' + #end if + #set $safename_reads_file = $safename_reads_file + $extension + #end if + ln -s '$file' $safename_reads_file && + #end if $files_to_upload.append(str($safename_reads_file)) #end for #end if @@ -127,6 +179,7 @@ #if $action_options.test_submit_parameters.dry_run == "false": +#if $action_options.action == "add": ena-upload-cli --tool 'ena-upload-cli v@VERSION@ @ Galaxy' --action '$action_options.action' @@ -136,22 +189,23 @@ #for $dataset in $files_to_upload: '$dataset' #end for +--action add --experiment '$experiments_table_path' 
--study '$studies_table_path' --run '$runs_table_path' --sample '$samples_table_path' #if $action_options.input_format_conditional.input_format == "user_generated_tables": #if "$action_options.input_format_conditional.viral_submission" == "true": - --vir + --checklist ERC000033 #end if #else: #if $action_options.input_format_conditional.input_format == "build_tables": #if $action_options.input_format_conditional.conditional_viral_metadata.viral_sample == "true": - --vir + --checklist ERC000033 #end if #else: #if $action_options.input_format_conditional.viral_submission == "true": - --vir + --checklist ERC000033 #end if #end if #end if @@ -160,6 +214,40 @@ -d #end if >> '$output'; +#end if + +#if $action_options.action == "modify": + ena-upload-cli + --tool 'ena-upload-cli v@VERSION@ @ Galaxy' + --action '$action_options.action' + --center '$action_options.center' + --secret \${credentials_path} + --data + #for $dataset in $files_to_upload: + '$dataset' + #end for +--action 'modify' +--experiment '$experiments_table_path' +--study '$studies_table_path' +--run '$runs_table_path' +--sample '$samples_table_path' +#if $action_options.input_format_conditional.input_format == "user_generated_tables": + #if "$action_options.input_format_conditional.viral_submission" == "true": + --checklist ERC000033 + #end if +#else: + #if $action_options.input_format_conditional.input_format == "build_tables": + #if $action_options.input_format_conditional.conditional_viral_metadata.viral_sample == "true": + --checklist ERC000033 + #end if + #else: + #if $action_options.input_format_conditional.viral_submission == "true": + --checklist ERC000033 + #end if + #end if +#end if + >> '$output'; +#end if echo -e 'center_name\t$action_options.center' >> '$output'; echo -e 'action_option\t$action_options.action' >> '$output'; #if $action_options.input_format_conditional.input_format != "user_generated_tables": @@ -232,6 +320,7 @@ </when> <when value="modify"> <expand 
macro="test_submit_section"/> + <param name="test_submit" type="hidden" value="False" /> <expand macro="table_inputs_macro" /> </when> </conditional> @@ -252,7 +341,7 @@ </data> </outputs> <tests> - <!--Test excel input of VIRAL samples --> + <!--Test 1: excel input of VIRAL samples --> <test> <conditional name="action_options"> <param name="action" value="add"/> @@ -265,6 +354,7 @@ <param name="viral_submission" value="True"/> <param name="xlsx_file" value="metadata_test_viral.xlsx"/> <conditional name="run_input_format_conditional"> + <param name="add_extension" value="true"/> <param name="run_input_format" value="multiple_selection_list"/> <param name="data" value="sample.fq"/> </conditional> @@ -276,7 +366,7 @@ <has_n_lines n="5"/> <has_n_columns n="17"/> <has_line_matching expression="alias\tstatus\taccession\ttitle\tstudy_alias\tsample_alias\tdesign_description\tlibrary_name\tlibrary_strategy\tlibrary_source\tlibrary_selection\tlibrary_layout\tinsert_size\tlibrary_construction_protocol\tplatform\tinstrument_model\tsubmission_date" /> - <has_line_matching expression="e_(.*)_026\tadd\taccession_ena\tNanopore sequencing\tSARS-CoV-2_genomes_01\ts_(.*)"/> + <has_line_matching expression="e_(.*)_026\tmodify\taccession_ena\tNanopore sequencing\tSARS-CoV-2_genomes_01\ts_(.*)"/> </assert_contents> </output> <output name="studies_table_out"> @@ -284,14 +374,13 @@ <has_n_lines n="2"/> <has_n_columns n="8"/> <has_line_matching expression="alias\tstatus\taccession\ttitle\tstudy_type\tstudy_abstract\tpubmed_id\tsubmission_date"/> - <has_line_matching expression="SARS-CoV-2_genomes_01\tadd\tENA_accession\tWhole-genome sequencing of SARS-CoV-2 from Covid-19 patients\tWhole Genome Sequencing\tWhole-genome sequences of SARS-CoV-2 from oro-pharyngeal swabs obtained from Covid-19 patients(.*)"/> + <has_line_matching expression="SARS-CoV-2_genomes_01\tmodify\tENA_accession\tWhole-genome sequencing of SARS-CoV-2 from Covid-19 patients\tWhole Genome Sequencing\tWhole-genome 
sequences of SARS-CoV-2 from oro-pharyngeal swabs obtained from Covid-19 patients(.*)"/> </assert_contents> </output> <output name="samples_table_out"> <assert_contents> <has_n_lines n="5"/> <has_n_columns n="18"/> - <has_line_matching expression="alias\ttitle\tscientific_name\tsample_description\tstatus\taccession\ttaxon_id\tsubmission_date\tgeographic_location\thost_common_name\thost_subject_id\thost_health_state\thost_sex\thost_scientific_name\tcollector_name\tcollecting_institution\tisolate\tcollection_date"/> </assert_contents> </output> <output name="runs_table_out"> @@ -299,11 +388,11 @@ <has_n_lines n="5"/> <has_n_columns n="8"/> <has_line_matching expression="alias\tstatus\taccession\texperiment_alias\tfile_name\tfile_format\tfile_checksum\tsubmission_date"/> - <has_line_matching expression="r_(.*)_026\tadd\tena_run_accession\te_(.*)_026\tC026_exp5_clean.fastq.gz\tfastq\tfile_checksum\tsubmission_date_ENA"/> + <has_line_matching expression="r_(.*)_026\tmodify\tena_run_accession\te_(.*)_026\tC026_exp5_clean.fastq.gz\tfastq\t\tsubmission_date_ENA"/> </assert_contents> </output> </test> - <!--Test excel input of VIRAL samples with extended columns--> + <!--Test 2: excel input of VIRAL samples with extended columns--> <test> <conditional name="action_options"> <param name="action" value="add"/> @@ -316,6 +405,7 @@ <param name="viral_submission" value="True"/> <param name="xlsx_file" value="metadata_test_viral_optional_columns.xlsx"/> <conditional name="run_input_format_conditional"> + <param name="add_extension" value="true"/> <param name="run_input_format" value="multiple_selection_list"/> <param name="data" value="sample.fq"/> </conditional> @@ -327,7 +417,7 @@ <has_n_lines n="5"/> <has_n_columns n="17"/> <has_line_matching 
expression="alias\tstatus\taccession\ttitle\tstudy_alias\tsample_alias\tdesign_description\tlibrary_name\tlibrary_strategy\tlibrary_source\tlibrary_selection\tlibrary_layout\tinsert_size\tlibrary_construction_protocol\tplatform\tinstrument_model\tsubmission_date" /> - <has_line_matching expression="e_(.*)_026\tadd\taccession_ena\tNanopore sequencing\tSARS-CoV-2_genomes_01\ts_(.*)"/> + <has_line_matching expression="e_(.*)_026\tmodify\taccession_ena\tNanopore sequencing\tSARS-CoV-2_genomes_01\ts_(.*)"/> </assert_contents> </output> <output name="studies_table_out"> @@ -335,14 +425,13 @@ <has_n_lines n="2"/> <has_n_columns n="8"/> <has_line_matching expression="alias\tstatus\taccession\ttitle\tstudy_type\tstudy_abstract\tpubmed_id\tsubmission_date"/> - <has_line_matching expression="SARS-CoV-2_genomes_01\tadd\tENA_accession\tWhole-genome sequencing of SARS-CoV-2 from Covid-19 patients\tWhole Genome Sequencing\tWhole-genome sequences of SARS-CoV-2 from oro-pharyngeal swabs obtained from Covid-19 patients(.*)"/> + <has_line_matching expression="SARS-CoV-2_genomes_01\tmodify\tENA_accession\tWhole-genome sequencing of SARS-CoV-2 from Covid-19 patients\tWhole Genome Sequencing\tWhole-genome sequences of SARS-CoV-2 from oro-pharyngeal swabs obtained from Covid-19 patients(.*)"/> </assert_contents> </output> <output name="samples_table_out"> <assert_contents> <has_n_lines n="5"/> <has_n_columns n="42"/> - <has_line_matching 
expression="alias\ttitle\tscientific_name\tsample_description\tstatus\taccession\ttaxon_id\tsubmission_date\tgeographic_location\thost_common_name\thost_subject_id\thost_health_state\thost_sex\thost_scientific_name\tcollector_name\tcollecting_institution\tisolate\tcollection_date\tgeographic_location_latitude\tgeographic_location_longitude\tsample_capture_status\thost_disease_outcome\thost_age\tvirus_identifier\treceipt_date\tdefinition_for_seropositive_sample\tserotype\thost_habitat\tisolation_source_host_associated\thost_behaviour\tisolation_source_non_host_associated\tsubject_exposure\tsubject_exposure_duration\ttype_exposure\tpersonal_protective_equipment\thospitalisation\tillness_duration\tillness_symptoms\tsample_storage_conditions\tstrain\thost_description\tgravidity"/> </assert_contents> </output> <output name="runs_table_out"> @@ -350,11 +439,11 @@ <has_n_lines n="5"/> <has_n_columns n="8"/> <has_line_matching expression="alias\tstatus\taccession\texperiment_alias\tfile_name\tfile_format\tfile_checksum\tsubmission_date"/> - <has_line_matching expression="r_(.*)_026\tadd\tena_run_accession\te_(.*)_026\tC026_exp5_clean.fastq.gz\tfastq\tfile_checksum\tsubmission_date_ENA"/> + <has_line_matching expression="r_(.*)_026\tmodify\tena_run_accession\te_(.*)_026\tC026_exp5_clean.fastq.gz\tfastq\t\tsubmission_date_ENA"/> </assert_contents> </output> </test> - <!--Test excel input of NON-VIRAL samples--> + <!--Test 3: excel input of NON-VIRAL samples--> <test> <conditional name="action_options"> <param name="action" value="add"/> @@ -367,6 +456,7 @@ <param name="viral_submission" value="False"/> <param name="xlsx_file" value="metadata_test_nonviral.xlsx"/> <conditional name="run_input_format_conditional"> + <param name="add_extension" value="true"/> <param name="run_input_format" value="multiple_selection_list"/> <param name="data" value="sample.fq"/> </conditional> @@ -399,11 +489,11 @@ <has_n_lines n="5"/> <has_n_columns n="8"/> <has_line_matching 
expression="alias\tstatus\taccession\texperiment_alias\tfile_name\tfile_format\tfile_checksum\tsubmission_date"/> - <has_line_matching expression="r_(.*)_026\tadd\tena_run_accession\te_(.*)_026\tC026_exp5_clean.fastq.gz\tfastq\tfile_checksum\tsubmission_date_ENA"/> + <has_line_matching expression="r_(.*)_026\tmodify\tena_run_accession\te_(.*)_026\tC026_exp5_clean.fastq.gz\tfastq\t\tsubmission_date_ENA"/> </assert_contents> </output> </test> - <!--Test failure on excel input of NON-VIRAL samples with runs PAIRED collection --> + <!--Test 4: failure on excel input of NON-VIRAL samples with runs PAIRED collection --> <test expect_failure="true"> <conditional name="action_options"> <param name="action" value="add"/> @@ -412,6 +502,7 @@ <param name="dry_run" value="false" /> </section> <conditional name="input_format_conditional"> + <param name="add_extension" value="true"/> <param name="input_format" value="excel_tables"/> <param name="viral_submission" value="False"/> <param name="xlsx_file" value="metadata_test_nonviral.xlsx"/> @@ -437,7 +528,7 @@ <has_text_matching expression="--action 'add' --center 'Some research center'"/> </assert_command> </test> - <!--Test build tables from user input fields NON-VIRAL samples--> + <!--Test 5: build tables from user input fields NON-VIRAL samples--> <test> <conditional name="action_options"> <param name="action" value="add"/> @@ -447,6 +538,7 @@ </section> <conditional name="input_format_conditional"> <param name="input_format" value="build_tables"/> + <param name="add_extension" value="true"/> <conditional name="conditional_viral_metadata"> <param name="viral_sample" value="False"/> <repeat name="rep_study"> @@ -507,7 +599,7 @@ </assert_contents> </output> </test> - <!--Test RUN failing build tables from user input fields NON-VIRAL samples--> + <!--Test 6: RUN failing build tables from user input fields NON-VIRAL samples--> <test expect_failure="true"> <conditional name="action_options"> <param name="action" value="add"/> @@ 
-517,6 +609,7 @@ </section> <conditional name="input_format_conditional"> <param name="input_format" value="build_tables"/> + <param name="add_extension" value="true"/> <conditional name="conditional_viral_metadata"> <param name="viral_sample" value="false"/> <repeat name="rep_study"> @@ -554,7 +647,8 @@ <has_text_matching expression="No ENA credentials defined"/> </assert_stdout> </test> - <!--Test with submit_test to skip credentials checksRUN failing build tables from user input fields NON-VIRAL samples--> + <!--Test 7: with submit_test to skip credentials checksRUN failing build tables from user input fields NON-VIRAL samples + also tests compression of uncompressed inputs and adding the .gz suffix --> <test expect_failure="true"> <conditional name="action_options"> <param name="action" value="add"/> @@ -564,6 +658,7 @@ </section> <param name="test_submit" value="True"/> <conditional name="input_format_conditional"> + <param name="add_extension" value="true"/> <param name="input_format" value="build_tables"/> <conditional name="conditional_viral_metadata"> <param name="viral_sample" value="false"/> @@ -590,7 +685,7 @@ <param name="instrument_model" value="Illumina HiSeq 4000"/> <repeat name="rep_runs"> <param name="run_base_name" value="run_from_hospital_X"/> - <param name="upload_files" value="1.fastqsanger.gz,sample.fq" ftype="fastqsanger"/> + <param name="upload_files" value="sample.fq,sample.fq" ftype="fastqsanger"/> </repeat> </repeat> </repeat> @@ -601,14 +696,15 @@ <param name="center" value="Some research center"/> <assert_command> <has_text_matching expression="ena-upload-cli"/> - <has_text_matching expression="--data '1.fastqsanger.gz' 'sample.fq'"/> + <has_text_matching expression="--data 'sample.fq.fastq.gz' 'sample.fq.fastq.gz'"/> <has_text_matching expression="--action 'add' --center 'Some research center'"/> + <not_has_text text="modify" /> </assert_command> <assert_stderr> <has_text_matching expression="Oops, the file test_fake_path does not 
exist"/> </assert_stderr> </test> - <!--test viral submission - User input metadata--> + <!--Test 8: viral submission - User input metadata - Add extension = False--> <test expect_failure="true"> <conditional name="action_options"> <param name="action" value="add"/> @@ -618,6 +714,73 @@ </section> <param name="test_submit" value="True"/> <conditional name="input_format_conditional"> + <param name="add_extension" value="False"/> + <param name="input_format" value="build_tables"/> + <conditional name="conditional_viral_metadata"> + <param name="viral_sample" value="true"/> + <repeat name="rep_study"> + <param name="study_title" value="Test study title"/> + <param name="study_abstract" value="Test study abstract"/> + <param name="study_type" value="Epigenetics"/> + <param name="study_pubmed_id" value="Test study pubmedID"/> + <repeat name="rep_sample"> + <param name="sample_title" value="Test Sample title"/> + <param name="sample_description" value="Test Sample description"/> + <param name="scientific_name" value="Test Sample scientific name"/> + <param name="tax_id" value="Test Sample tax_id"/> + <param name="collection_date" value="2020"/> + <param name="geo_location_country" value="Belgium"/> + <param name="host_common_name" value="Human"/> + <param name="host_subject_id" value="Patient_001"/> + <param name="host_health_state" value="healthy"/> + <param name="host_sex" value="female"/> + <param name="host_scientific_name" value="homo sapiens"/> + <param name="collector_name" value="John The Collector"/> + <param name="collecting_institution" value="Hospital 01"/> + <param name="isolate" value="sample_001"/> + <repeat name="rep_experiment"> + <param name="experiment_title" value="Test experiment title"/> + <param name="experiment_design" value="Test experiment design description"/> + <param name="library_strategy" value="CTS"/> + <param name="library_source" value="GENOMIC"/> + <param name="library_selection" value="PCR"/> + <param name="library_layout" 
value="SINGLE"/> + <param name="insert_size" value="150"/> + <param name="library_construction_protocol" value="Test library construction"/> + <param name="platform" value="ILLUMINA"/> + <param name="instrument_model" value="Illumina HiSeq 4000"/> + <repeat name="rep_runs"> + <param name="run_base_name" value="run_from_hospital_X"/> + <param name="upload_files" value="1.fastqsanger.gz,2.fastqsanger.gz" ftype="fastqsanger.gz"/> + </repeat> + </repeat> + </repeat> + </repeat> + </conditional> + </conditional> + </conditional> + <param name="center" value="Some research center"/> + <assert_command> + <has_text_matching expression="ena-upload-cli"/> + <has_text_matching expression="--data '1.fastqsanger.gz' '2.fastqsanger.gz'"/> + <has_text_matching expression="--action 'add' --center 'Some research center'"/> + <has_text_matching expression="--checklist ERC000033"/> + </assert_command> + <assert_stderr> + <has_text_matching expression="Oops, the file test_fake_path does not exist"/> + </assert_stderr> + </test> + <!--Test 9: modify option and auto compression - viral submission - User input metadata--> + <test expect_failure="true"> + <conditional name="action_options"> + <param name="action" value="modify"/> + <section name="test_submit_parameters"> + <param name="submit_dev" value="false" /> + <param name="dry_run" value="false" /> + </section> + <param name="test_submit" value="True"/> + <conditional name="input_format_conditional"> + <param name="add_extension" value="False"/> <param name="input_format" value="build_tables"/> <conditional name="conditional_viral_metadata"> <param name="viral_sample" value="True"/> @@ -654,7 +817,7 @@ <param name="instrument_model" value="Illumina HiSeq 4000"/> <repeat name="rep_runs"> <param name="run_base_name" value="run_from_hospital_X"/> - <param name="upload_files" value="1.fastqsanger.gz,sample.fq" ftype="fastqsanger"/> + <param name="upload_files" value="sample.fq" ftype="fastqsanger"/> </repeat> </repeat> </repeat> @@ 
-665,9 +828,10 @@ <param name="center" value="Some research center"/> <assert_command> <has_text_matching expression="ena-upload-cli"/> - <has_text_matching expression="--data '1.fastqsanger.gz' 'sample.fq'"/> - <has_text_matching expression="--action 'add' --center 'Some research center'"/> - <has_text_matching expression="--vir"/> + <has_text_matching expression="--data 'sample.fq.gz'"/> + <has_text_matching expression="--action 'modify' --center 'Some research center'"/> + <has_text_matching expression="--checklist ERC000033"/> + <not_has_text text="add" /> </assert_command> <assert_stderr> <has_text_matching expression="Oops, the file test_fake_path does not exist"/>
--- a/process_xlsx.py Wed Aug 18 19:42:49 2021 +0000 +++ b/process_xlsx.py Tue Oct 19 15:57:14 2021 +0000 @@ -4,11 +4,24 @@ import xlrd import yaml +from check_remote import check_remote_entry from mappings import optional_samples_cols_mapping FILE_FORMAT = 'fastq' +def identify_action(entry_type, alias): + ''' define action ['add' | 'modify'] that needs to be performed for this entry ''' + query = {entry_type + '_alias': alias} + remote_accessions = check_remote_entry(entry_type, query) + if len(remote_accessions) > 0: + print(f'Found: {entry_type} entry with alias {alias}') + return 'modify' + else: + print(f'No {entry_type} entry found with alias {alias}') + return 'add' + + def extract_data(xl_sheet, expected_columns, optional_cols=None): """ 1. Check that the columns I expect are present in the sheet @@ -86,6 +99,7 @@ parser.add_argument('--out_dir', dest='out_path', required=True) parser.add_argument('--action', dest='action', required=True) parser.add_argument('--vir', dest='viral_submission', required=False, action='store_true') +parser.add_argument('--dev', dest='dev_submission', required=False, action='store_true') parser.add_argument('--verbose', dest='verbose', required=False, action='store_true') args = parser.parse_args() @@ -148,10 +162,10 @@ samples_cols = samples_cols + ['status', 'accession', 'taxon_id', 'submission_date'] if args.viral_submission: # extend the samples columns with the viral specific data - samples_cols = samples_cols + ['geographic_location', 'host_common_name', - 'host_subject_id', 'host_health_state', 'host_sex', - 'host_scientific_name', 'collector_name', - 'collecting_institution', 'isolate'] + samples_cols = samples_cols + ['geographic location (country and/or sea)', 'host common name', + 'host subject id', 'host health state', 'host sex', + 'host scientific name', 'collector name', + 'collecting institution', 'isolate'] if len(samples_optional_cols_loaded) > 0: for optional_cols_excel in samples_optional_cols_loaded: 
samples_cols.append(optional_samples_cols_mapping[optional_cols_excel]) @@ -168,7 +182,7 @@ runs_table.write('\t'.join(['alias', 'status', 'accession', 'experiment_alias', 'file_name', 'file_format', 'file_checksum', 'submission_date']) + '\n') action = args.action - +# actionable_items # WRITE DICTIONARIES TO TABLE FILES # ADD A TIMESTAMP TO THE ALIAS? SEEMS LIKE ENA REQUIRES ALL ENTRIES FOR A WEBIN TO HAVE UNIQUE IDS? @@ -178,14 +192,22 @@ exp_included = [] for study_alias, study in studies_dict.items(): # study_alias = study_alias + '_' + timestamp - studies_table.write('\t'.join([study_alias, action, 'ENA_accession', study['title'], + if args.dev_submission: + entry_action = args.action + else: + entry_action = identify_action('study', study_alias) + studies_table.write('\t'.join([study_alias, entry_action, 'ENA_accession', study['title'], study['study_type'], study['study_abstract'], '', 'ENA_submission_data']) + '\n') # assuming no pubmed_id for sample_alias, sample in samples_dict.items(): # sample_alias = sample_alias + '_' + timestamp + if args.dev_submission: + entry_action = args.action + else: + entry_action = identify_action('sample', sample_alias) samples_row_values = [sample_alias, sample['title'], sample['scientific_name'], - sample['sample_description'], action, 'ena_accession', - 'tax_id_updated_by_ENA', 'ENA_submission_date'] + sample['sample_description'], entry_action, 'ena_accession', + '', 'ENA_submission_date'] if args.viral_submission: # add the values that are unique for the viral samples if sample['collector name'] == '': @@ -230,7 +252,12 @@ # (not listed in the samples or study dict) # process the experiments for this sample if exp['sample_alias'] == sample_alias: - experiments_table.write('\t'.join([exp_alias, action, 'accession_ena', exp['title'], + # check the remote status + if args.dev_submission: + entry_action = args.action + else: + entry_action = identify_action('experiment', exp_alias) + 
experiments_table.write('\t'.join([exp_alias, entry_action, 'accession_ena', exp['title'], exp['study_alias'], sample_alias, exp['design_description'], exp['library_name'], exp['library_strategy'], exp['library_source'], @@ -250,9 +277,13 @@ runs_list = run for run_entry in runs_list: if run_entry['experiment_alias'] == exp_alias: - runs_table.write('\t'.join([run_alias, action, 'ena_run_accession', + if args.dev_submission: + entry_action = args.action + else: + entry_action = identify_action('run', run_alias) + runs_table.write('\t'.join([run_alias, entry_action, 'ena_run_accession', exp_alias, run_entry['file_name'], - FILE_FORMAT, 'file_checksum', + FILE_FORMAT, '', 'submission_date_ENA']) + '\n') runs_included.append(run_alias)
--- a/samples_macros.xml Wed Aug 18 19:42:49 2021 +0000 +++ b/samples_macros.xml Tue Oct 19 15:57:14 2021 +0000 @@ -12,10 +12,11 @@ <option value="paired_list" selected="False">Input from a paired collection</option> </param> <when value="multiple_selection_list"> - <param name="data" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="Select individual datasets or a dataset collection" help="Names should match the compressed run's files names defined in the metadata"/> + <param name="add_extension" type="boolean" checked="False" label="Add .fastq.(gz,.bz2) extension to the Galaxy dataset names to match the ones described in the input tables?"/> + <param name="data" type="data" format="fastq,fastqsanger,fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="Select individual datasets or a dataset collection" help="Names should match the compressed run's files names defined in the metadata"/> </when> <when value="paired_list"> - <param name="paired_end_collection" collection_type="list:paired" type="data_collection" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" label="List of paired-end runs files" help="Names should match the compressed run's files names defined in the metadata" /> + <param name="paired_end_collection" collection_type="list:paired" type="data_collection" format="fastq,fastqsanger,fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" label="List of paired-end runs files" help="Names should match the compressed run's files names defined in the metadata" /> </when> </conditional> </xml> @@ -40,6 +41,7 @@ <param name="runs_users_table" type="data" format="tabular" multiple="false" label="Runs table" help="Runs metadata file"/> </when> <when value="build_tables"> + <param name="add_extension" type="boolean" checked="false" label="Add .fastq.(gz.bz2) extension to the Galaxy dataset names to match the ones described in the input tables?"/> <conditional 
name="conditional_viral_metadata"> <param name="viral_sample" type="boolean" truevalue="true" falsevalue="false" label="Does your submission contains viral samples?" /> <when value="true"> @@ -138,8 +140,8 @@ </options> </param> <repeat name="rep_runs" title="Runs executed within this experiment" min="1" > - <param name="run_base_name" type="text" optional="False" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/> - <param name="upload_files" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/> + <param name="run_base_name" type="text" optional="False" value="" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/> + <param name="upload_files" type="data" format="fastq,fastqsanger,fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/> </repeat> </repeat> </repeat> @@ -203,8 +205,8 @@ </options> </param> <repeat name="rep_runs" title="Runs executed within this experiment" min="1" > - <param name="run_base_name" type="text" optional="False" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/> - <param name="upload_files" type="data" format="fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/> + <param name="run_base_name" type="text" optional="False" value="" label="Run alias" help="If an alias is not provided it will be generated combining the sample and experiment indexes"/> + <param name="upload_files" type="data" format="fastq,fastqsanger,fastqsanger.gz,fastqsanger.bz2,fastq.gz,fastq.bz2" multiple="true" label="File(s) associated with this run"/> </repeat> </repeat> </repeat>