Previous changeset 2:1ecd8ce07db4 (2022-02-04) |
Commit message:
Uploaded |
modified:
ena_consensus_submit.xml process_input.py |
added:
test-data/.phiX.fasta.swp test-data/.receipt_sample.txt.swp test-data/phiX2.fasta.gz test-data/receipt_sample.txt test-data/receipt_sample_noPhiX.txt test-data/sample_alias_001.fasta.gz |
b |
diff -r 1ecd8ce07db4 -r 7d751b5943b0 ena_consensus_submit.xml --- a/ena_consensus_submit.xml Fri Feb 04 15:52:45 2022 +0000 +++ b/ena_consensus_submit.xml Tue Feb 22 11:03:34 2022 +0000 |
[ |
b'@@ -51,8 +51,18 @@\n echo -e \'MOLECULETYPE\\t$molecule_type\' >> $manifest_base;\n \n #if $metadata_file_or_form.metadata_format == "file":\n+ #import re\n+ #for $file in $metadata_file_or_form.genome_fasta:\n+ #if $file.is_of_type(\'fasta\'):\n+ #set $full_name = $file.element_identifier + \'.gz\'\n+ gzip -c $file > \'./fasta/$full_name\';\n+ #else:\n+ ln -s $file \'./fasta/$file.element_identifier\';\n+ #end if\n+\n+ #end for\n ## process the input tables, this creates an intermediate file with information\n- python3 \'$__tool_directory__/process_input.py\' $metadata_file_or_form.ena_receipt $genome_fasta \'./manifests\' \'./fasta\' $manifest_base;\n+ python3 \'$__tool_directory__/process_input.py\' $metadata_file_or_form.ena_receipt $genome_fasta_files \'./manifests\' $manifest_base >> $webin_cli_log;\n center_name=`grep \'center_name\' $metadata_file_or_form.ena_receipt | cut -f2,2 | tr -d \'\\n\'`;\n #else:\n #set $generated_manifest=\'./manifests/generated_manifest.txt\'\n@@ -64,6 +74,12 @@\n center_name=\'$metadata_file_or_form.center_name\';\n echo -e \'NAME\\t$metadata_file_or_form.assembly_name\' >> $generated_manifest;\n echo -e \'PLATFORM\\t$metadata_file_or_form.sequencing_platform\' >> $generated_manifest;\n+ #if $metadata_file_or_form.genome_fasta.is_of_type(\'fasta\'):\n+ gzip -c $metadata_file_or_form.genome_fasta > consensus.fasta.gz;\n+ #else:\n+ ln -s $metadata_file_or_form.genome_fasta consensus.fasta.gz;\n+ #end if\n+ echo -e \'FASTA\\tconsensus.fasta.gz\' >> $generated_manifest;\n #end if\n \n #set $outputs_dir = \'outputs\'\n@@ -73,6 +89,7 @@\n ## in case of errors, this list is empty\n while read line; do\n manifest=`echo \\$line | cut -d\' \' -f1,1`;\n+ echo "Submitting manifest \\$manifest" >> $webin_cli_log;\n ena-webin-cli\n -context genome\n -userName "\'\\$webin_id\'"\n@@ -84,12 +101,10 @@\n -validate\n #end if\n -outputDir $outputs_dir\n- >> $webin_cli_log ;\n- done < submit_list.tab\n+ >> $webin_cli_log;\n+ done < submit_list.tab;\n \n #else:\n- gzip -c $genome_fasta > consensus.fasta.gz;\n- echo -e \'FASTA\\tconsensus.fasta.gz\' >> $generated_manifest;\n ena-webin-cli\n #if $submit_test == "true":\n -test\n@@ -109,7 +124,7 @@\n >> $webin_cli_log ;\n ##cp ./genome/$metadata_file_or_form.assembly_name/validate/webin-cli.report $validate_output\n #end if\n-tar -cf $webin_cli_outputs $outputs_dir;\n+tar -cf $webin_cli_outputs $outputs_dir ;\n ]]></command>\n <configfiles>\n <configfile name="credentials"><![CDATA[\n@@ -121,15 +136,26 @@\n #end if\n \n ]]></configfile>\n+ <configfile name="genome_fasta_files">\n+#import json\n+#import re\n+#if $metadata_file_or_form.metadata_format == "file":\n+ #set $fasta_files_list = list()\n+ #for $file in $metadata_file_or_form.genome_fasta:\n+ $fasta_files_list.append(str($file.element_identifier))\n+ #end for\n+ #echo json.dumps($fasta_files_list)\n+#end if\n+ </configfile>\n </configfiles>\n <inputs>\n <param name="test_submit" type="hidden" value="False" />\n <param name="submit_test" type="boolean" truevalue="true" falsevalue="false" label="Submit to test server" help="use Webin test service instead of the production service. Please note that the Webin upload area is shared between test and production services, and that test submission files will not be archived." />\n <param name="dry_run" type="boolean" truevalue="true" falsevalue="false" label="Validate files and metadata but do not submit" help="Generate input files and run Webin-CLI with -validate option. If \'No\' is selected then it will validate and submit (-submit flag)"/>\n- <param name="genome_fasta" type="data" label="Select the consensus sequence assembly file" format="fasta"/>\n <param name="assembly_type" type="select" label="Assembly type">\n <option value="clone">Clone</option>\n <option value'..b'bel="Select the consensus sequence assembly file" format="fasta,fasta.gz"/>\n </when>\n </conditional>\n <param name="min_gap_length" type="text" optional="True" label="Minimum gap length (optional)"/>\n@@ -170,7 +198,6 @@\n <param name="submit_test" value="true" />\n <param name="dry_run" value="true" />\n <param name="test_submit" value="True" />\n- <param name="genome_fasta" value="phiX2.fasta"/>\n <param name="assembly_type" value="isolate"/>\n <param name="assembly_program" value="Test assembly program"/>\n <param name="molecule_type" value="viral cRNA"/>\n@@ -183,6 +210,7 @@\n <param name="sequencing_platform" value="Nanopore 0011"/>\n <param name="description" value="Test Description"/>\n <param name="center_name" value="Test center name"/>\n+ <param name="genome_fasta" value="phiX2.fasta"/>\n </conditional>\n <param name="min_gap_length" value="30"/>\n <output name="webin_cli_log">\n@@ -192,8 +220,52 @@\n </assert_contents>\n </output>\n </test>\n+ <test>\n+ <param name="submit_test" value="true" />\n+ <param name="dry_run" value="true" />\n+ <param name="test_submit" value="True" />\n+ <param name="assembly_type" value="isolate"/>\n+ <param name="assembly_program" value="Test assembly program"/>\n+ <param name="molecule_type" value="viral cRNA"/>\n+ <param name="coverage" value="10000"/>\n+ <conditional name="metadata_file_or_form">\n+ <param name="metadata_format" value="file"/>\n+ <param name="ena_receipt" value="receipt_sample_noPhiX.txt"/>\n+ <param name="genome_fasta" value="phiX2.fasta.gz,sample_alias_001.fasta.gz"/>\n+ </conditional>\n+ <param name="min_gap_length" value="30"/>\n+ <output name="webin_cli_log">\n+ <assert_contents>\n+ <has_text_matching expression="Processing phiX2"/>\n+ <has_text_matching expression="No metadata found for sample phiX2"/>\n+ <has_text_matching expression="Processing sample_alias_001"/>\n+ <has_text_matching expression="Submitting manifest ./manifests/sample_alias_001.manifest.txt"/>\n+ <has_text_matching expression="ERROR: Invalid submission account user name or password. Please try enclosing your password in single quotes."/>\n+ </assert_contents>\n+ </output>\n+ </test>\n+ <test>\n+ <param name="submit_test" value="true" />\n+ <param name="dry_run" value="true" />\n+ <param name="test_submit" value="True" />\n+ <param name="assembly_type" value="isolate"/>\n+ <param name="assembly_program" value="Test assembly program"/>\n+ <param name="molecule_type" value="viral cRNA"/>\n+ <param name="coverage" value="10000"/>\n+ <conditional name="metadata_file_or_form">\n+ <param name="metadata_format" value="file"/>\n+ <param name="ena_receipt" value="receipt_sample.txt"/>\n+ <param name="genome_fasta" value="sample_alias_001.fasta.gz"/>\n+ </conditional>\n+ <param name="min_gap_length" value="30"/>\n+ <output name="webin_cli_log">\n+ <assert_contents>\n+ <has_text_matching expression="ERROR: Invalid submission account user name or password. Please try enclosing your password in single quotes."/>\n+ </assert_contents>\n+ </output>\n+ </test>\n </tests>\n <help><![CDATA[\n- TODO: Fill in help.\n+ This tool is a wrapper for the ENA Webin CLI submission tool (https://ena-docs.readthedocs.io/en/latest/submit/general-guide/webin-cli.html).\n ]]></help>\n </tool>\n' |
b |
diff -r 1ecd8ce07db4 -r 7d751b5943b0 process_input.py --- a/process_input.py Fri Feb 04 15:52:45 2022 +0000 +++ b/process_input.py Tue Feb 22 11:03:34 2022 +0000 |
[ |
b'@@ -1,127 +1,127 @@\n import gzip\n+import json\n import os\n import sys\n import shutil\n import yaml\n \n-from Bio import SeqIO\n-\n-\n-"""\n-Takes as input:\n- 1. A receipt obtained from ENA submission tool. \n- A txt file that includes a YAML section with \n-\n- 2. A fasta file with fasta entries ids defined after the files used for the raw submission.\n-\n- 3. Path to write generated manifests\n- 4. Path to write generated fasta files\n- 5. manifest template path: the manifest with the global values set (e.g COVERAGE, MINGAPLENGHT..)\n-"""\n-\n-def get_section_string(f, start_line, end_line):\n+def get_section_string(f, start_line, end_line, return_string=False):\n # consume starting lines\n start_string = iter(f.readline, start_line)\n start_string = \'\'.join(line for line in start_string)\n # read YAML lines\n yaml_string = iter(f.readline, end_line)\n- return \'\'.join(x for x in yaml_string)\n+ if return_string:\n+ return \'\'.join(x for x in yaml_string)\n+ else:\n+ return [x for x in yaml_string]\n+ \n+def fill_from_yaml_data(yaml_only_dict, studies_samples_dict):\n+ # fill experiment information (platform) **** \n+ for index,exp in yaml_only_dict[\'ENA_experiment\'].items():\n+ study_alias = exp[\'study_alias\']\n+ sample_alias = exp[\'sample_alias\']\n+ if study_alias in studies_samples_dict.keys():\n+ if sample_alias in studies_samples_dict[study_alias].keys():\n+ studies_samples_dict[study_alias][sample_alias][\'experiments\'].append({\'platform\': exp[\'platform\']})\n+ else:\n+ studies_samples_dict[study_alias][sample_alias] = {\'experiments\': [{\'platform\': exp[\'platform\']}]}\n+ else:\n+ studies_samples_dict[study_alias] = {sample_alias: {\'experiments\':[{\'platform\': exp[\'platform\']}]}}\n+\n+\n+def load_receipt_data(input_file_path):\n+ # should do some health check of the input file?\n+ # load yaml section\n+ loaded_data = {} \n+ yaml_delimiter = \'YAML -------------\\n\'\n+ with open(input_file_path) as input_file:\n+ yaml_only_section = yaml.safe_load(get_section_string(input_file, start_line=yaml_delimiter, end_line=yaml_delimiter, return_string=True))\n+ fill_from_yaml_data(yaml_only_section, loaded_data)\n+ # read study accessions\n+ study_delimiter = \'Study accession details:\\n\'\n+ end_line = \'\\n\'\n+ with open(input_file_path) as input_file:\n+ studies_accession_lines = get_section_string(input_file, start_line=study_delimiter, end_line=end_line)\n+ # loaded_data[\'studies\'] = {}\n+ for study_line in studies_accession_lines:\n+ if study_line != \'\\n\':\n+ alias, accession, *_ = study_line.split(\'\\t\')\n+ try:\n+ loaded_data[alias][\'accession\'] = accession\n+ except KeyError:\n+ print(f"Experiment {exp} has unknown study or sample")\n+ # loaded_data[\'studies\'][alias][\'accession\'] = accession\n+ samples_delimiter = \'Sample accession details:\\n\'\n+ with open(input_file_path) as input_file:\n+ samples_accession_lines = get_section_string(input_file, start_line=samples_delimiter, end_line=end_line)\n+ ## need to iterate over all studies, because here I don\'t know which study is the sample from.\n+ # loaded_data[\'samples\'] = {}\n+ for sample_line in samples_accession_lines:\n+ if sample_line != \'\\n\':\n+ alias, accession, *_ = sample_line.split(\'\\t\')\n+ for study in loaded_data.keys():\n+ if alias in loaded_data[study].keys():\n+ loaded_data[study][alias][\'accession\'] = accession\n+ break\n+ return loaded_data\n+\n+\n+"""\n+Takes as input:\n+ 1. A receipt obtained from ENA submission tool: \n+ a txt file that contains sections describing submission details.\n+ 2. A json file with the list of fasta that the user loaded\n+ 3. Path to write generated manifests\n+ 4. Manifest template path: the manifest with the glob'..b' # first dump the contents of manifest template\n+ # containing the global vars\n+ with open(manifest_template) as m_template:\n+ output_handle.write(m_template.read())\n+ output_handle.write("ASSEMBLYNAME\\tconsensus_" + sample_alias + "\\n")\n+ output_handle.write("PLATFORM\\t" + platform + "\\n")\n+ output_handle.write("STUDY\\t" + study_accession + "\\n")\n+ output_handle.write("SAMPLE\\t" + sample_accession + "\\n")\n+ # files should be available in the corresponding dir and named:\n+ # sample_alias.fasta.gz \n+ output_handle.write("FASTA\\t" + sample_alias + \'.fasta.gz\' + "\\n")\n+ found_metadata = True\n+ written_manifests_out.write(manifest_path + \'\\n\')\n+ break\n+ if not found_metadata:\n+ print(f\'No metadata found for sample {sample_alias}\')\n \n- # and finally create a fasta file for each sequence (e.g named with the seq id or the run ID)\n- fasta_path = os.path.join(out_fasta_base, seq_id + \'.fasta\')\n- with open(fasta_path, "w") as output_handle:\n- SeqIO.write([record], output_handle, "fasta")\n- #gzip the file (required by ENA upload tool)\n- fasta_path_gz = fasta_path + \'.gz\'\n- with open(fasta_path, \'rb\') as f_in:\n- with gzip.open(fasta_path_gz, \'wb\') as f_out:\n- shutil.copyfileobj(f_in, f_out)\n- # create the manifest\n- # add to the manifest the: \n- # \n- manifest_path = os.path.join(out_manifest_base, seq_id + \'.manifest.txt\')\n- with open(manifest_path, "w") as output_handle:\n- # first dump the contents of manifest template\n- # containing the global vars\n- with open(manifest_template) as m_template:\n- output_handle.write(m_template.read())\n- output_handle.write("ASSEMBLYNAME\\tconsensus_" + seq_id + "\\n")\n- output_handle.write("PLATFORM\\t" + platform + "\\n")\n- output_handle.write("STUDY\\t" + study_alias + "\\n")\n- output_handle.write("SAMPLE\\t" + sample_alias + "\\n")\n- output_handle.write("FASTA\\t" + fasta_path_gz + "\\n")\n-\n- # ... and a dict (or tuple list???) that contains for each study - sample the name of the file that has the consensus sequence\n- # **** is it ok to use the unique ids of the study and sample in the manifest?? or should I use the accessions??\n- # in the latest case then I also need to parse the Study accession details: and Sample accession details: entries\n- # samples_dir[study][sample] = seq_id + \'.fasta\'\n- submission_tuples_list.append((manifest_path, fasta_path))\n-\n- with open(\'submit_list.tab\', "w") as output_handle:\n- for submit_tuple in submission_tuples_list:\n- output_handle.write(\'\\t\'.join(submit_tuple) + \'\\n\')\n- ## DEBUG CASE\n- #study details\n- # start_study = \'Study accession details:\\n\'\n- # empty_end = \'\\n\'\n- # study_data = get_section_string(input_file, start_line=start_study, end_line=empty_end)\n- # if len(study_data.split(\'\\n\')) > 2:\n- # # more than 1 study accession\n- # raise Exception("Multiple study accessions found")\n- # out_manifest.write(f\'STUDY\\t{study_data.split()[1]}\\n\')\n- # start_sample = \'Sample accession details:\\n\'\n- # sample_data = get_section_string(input_file, start_line=start_sample, end_line=empty_end)\n- # if len(sample_data.split(\'\\n\')) > 2:\n- # # more than 1 study accession\n- # raise Exception("Multiple sample accessions found")\n- # out_manifest.write(f\'SAMPLE\\t{sample_data.split()[1]}\\n\')\n- # platform = \'Ion Torrent\'\n- # out_manifest.write(f"PLATFORM\\t{platform}\\n")\n- # out_manifest.close()\n \n if __name__ == \'__main__\':\n main()\n' |
b |
diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/.phiX.fasta.swp |
b |
Binary file test-data/.phiX.fasta.swp has changed |
b |
diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/.receipt_sample.txt.swp |
b |
Binary file test-data/.receipt_sample.txt.swp has changed |
b |
diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/phiX2.fasta.gz |
b |
Binary file test-data/phiX2.fasta.gz has changed |
b |
diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/receipt_sample.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/receipt_sample.txt Tue Feb 22 11:03:34 2022 +0000 |
b |
@@ -0,0 +1,76 @@ +YAML ------------- +ENA_experiment: + 0: + alias: exp_test_alias_001 + design_description: Lot's of coffe and magic + insert_size: 250.0 + instrument_model: NextSeq 500 + library_construction_protocol: Illumina COVIDSeq Test Kit + library_layout: PAIRED + library_name: Cov51 + library_selection: RT-PCR + library_source: VIRAL RNA + library_strategy: AMPLICON + platform: ILLUMINA + sample_alias: sample_alias_001 + study_alias: study_alias_001 + title: Illumina NextSeq paired end sequencing; Illumina COVIDSeq Test + 1: + alias: exp_test_alias_002 + design_description: Lot's of coffe and magic + insert_size: 250.0 + instrument_model: NextSeq 500 + library_construction_protocol: Illumina COVIDSeq Test Kit + library_layout: PAIRED + library_name: Cov51 + library_selection: RT-PCR + library_source: VIRAL RNA + library_strategy: AMPLICON + platform: ILLUMINA + sample_alias: phiX2 + study_alias: study_alias_001 + title: Illumina NextSeq paired end sequencing; Illumina COVIDSeq Test +ENA_run: + 2: + alias: run_alias_001 + experiment_alias: exp_test_alias_001 + file_format: FASTQ + file_name: run001.fastq.gz +ENA_sample: + 2: + alias: sample_alias_001 + collecting institution: Umbrella Corp. + collection date: '2021-05-03' + collector name: "John Doe" + definition for seropositive sample: '' + 2: + alias: phiX2 + collecting institution: Umbrella Corp. + collection date: '2021-05-03' + collector name: "John Doe" + definition for seropositive sample: '' +ENA_study: + 2: + alias: study_alias_001 + study_abstract: "Help" + study_type: Whole Genome Sequencing + title: Whole genome sequencing of SARS-CoV-2 +YAML ------------- + +Printing receipt to ./receipt.xml + +Submission was done successfully + +Study accession details: +study_alias_001 FAKE0001 2011-01-16T10:52:06.497+01:00 added + +Sample accession details: +sample_alias_001 FAKESAMP001 2011-01-16T10:52:06.497+01:00 added +phiX2 FAKESAMP002 2011-01-16T10:52:06.497+01:00 added + +Saving updates in new tsv tables:: +save updates in ./submission_files/studies_updated.tsv +save updates in ./submission_files/samples_updated.tsv +save updates in ./submission_files/experiments_updated.tsv +save updates in ./submission_files/runs_updated.tsv +action_option add |
b |
diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/receipt_sample_noPhiX.txt --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/receipt_sample_noPhiX.txt Tue Feb 22 11:03:34 2022 +0000 |
b |
@@ -0,0 +1,54 @@ +YAML ------------- +ENA_experiment: + 0: + alias: exp_test_alias_001 + design_description: Lot's of coffe and magic + insert_size: 250.0 + instrument_model: NextSeq 500 + library_construction_protocol: Illumina COVIDSeq Test Kit + library_layout: PAIRED + library_name: Cov51 + library_selection: RT-PCR + library_source: VIRAL RNA + library_strategy: AMPLICON + platform: ILLUMINA + sample_alias: sample_alias_001 + study_alias: study_alias_001 + title: Illumina NextSeq paired end sequencing; Illumina COVIDSeq Test +ENA_run: + 2: + alias: run_alias_001 + experiment_alias: exp_test_alias_001 + file_format: FASTQ + file_name: run001.fastq.gz +ENA_sample: + 2: + alias: sample_alias_001 + collecting institution: Umbrella Corp. + collection date: '2021-05-03' + collector name: "John Doe" + definition for seropositive sample: '' +ENA_study: + 2: + alias: study_alias_001 + study_abstract: "Help" + study_type: Whole Genome Sequencing + title: Whole genome sequencing of SARS-CoV-2 +YAML ------------- + +Printing receipt to ./receipt.xml + +Submission was done successfully + +Study accession details: +study_alias_001 FAKE0001 2011-01-16T10:52:06.497+01:00 added + +Sample accession details: +sample_alias_001 FAKESAMP001 2011-01-16T10:52:06.497+01:00 added + +Saving updates in new tsv tables:: +save updates in ./submission_files/studies_updated.tsv +save updates in ./submission_files/samples_updated.tsv +save updates in ./submission_files/experiments_updated.tsv +save updates in ./submission_files/runs_updated.tsv +action_option add |
b |
diff -r 1ecd8ce07db4 -r 7d751b5943b0 test-data/sample_alias_001.fasta.gz |
b |
Binary file test-data/sample_alias_001.fasta.gz has changed |