changeset 0:5e7401777990 draft default tip

planemo upload for repository https://github.com/galaxyproject/tools-iuc/tree/main/tools/fastq_dl commit 8da9481e027494c5fd881564d01d9e2ab55fe305
author iuc
date Sat, 16 Nov 2024 18:43:55 +0000
parents
children
files fastq_dl.xml macros.xml test-data/Metadata_files/DRR011117.tsv test-data/Metadata_files/ERR2651925.tsv test-data/Metadata_files/ERR4319712.tsv test-data/Metadata_files/SRR9678965.tsv test-data/accessions.txt
diffstat 7 files changed, 380 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/fastq_dl.xml	Sat Nov 16 18:43:55 2024 +0000
@@ -0,0 +1,326 @@
+<tool id="fastq_dl" name="fastq-dl" version="@TOOL_VERSION@+galaxy@VERSION_SUFFIX@" profile="22.05">
+    <description>Download FASTQ files from ENA</description>
+    <macros>
+        <import>macros.xml</import>
+    </macros>
+    <expand macro="requirements"/>
+    <expand macro="creators"/>
+    <command detect_errors="aggressive"><![CDATA[
+    ## Staging directories that the output collections are discovered from.
+    mkdir -p single-end paired-end logs &&
+    ## Fill the bash array accessionsarr from the chosen input, then run the shared
+    ## per-accession download loop defined in macros.xml.
+    #if str($input_type.select_input_type) == "accession_ids"
+        IFS=' ' &&
+        read -ra accessionsarr <<< "$accessions" &&
+        @FASTQ_DL_FOR_LOOP@
+    #elif str($input_type.select_input_type) == "accessions_list"
+        mapfile -t accessionsarr < "$accessions_file" &&
+        @FASTQ_DL_FOR_LOOP@
+    #end if
+    #if str($only_download_metadata) == ""
+        &&
+        ## Mate files (suffix _1/_2 or _R1/_R2) are renamed to _forward/_reverse and moved
+        ## into paired-end, matching the identifiers used by the list:paired discovery pattern.
+        find . -maxdepth 1 -name "*_1.fastq.gz" -exec bash -c 'mv "\$0" "paired-end/\$(basename "\$0" | sed "s/_1/_forward/")"' {} \; &&
+        find . -maxdepth 1 -name "*_2.fastq.gz" -exec bash -c 'mv "\$0" "paired-end/\$(basename "\$0" | sed "s/_2/_reverse/")"' {} \; &&
+        find . -maxdepth 1 -name "*_R1.fastq.gz" -exec bash -c 'mv "\$0" "paired-end/\$(basename "\$0" | sed "s/_R1/_forward/")"' {} \; &&
+        find . -maxdepth 1 -name "*_R2.fastq.gz" -exec bash -c 'mv "\$0" "paired-end/\$(basename "\$0" | sed "s/_R2/_reverse/")"' {} \; &&
+        ## Every remaining .gz file is treated as single-end. The move is best-effort because
+        ## there may be none; the braces keep the fallback local to this mv so that a failure
+        ## of any earlier step in the list still yields a non-zero exit status.
+        { mv *.gz single-end > /dev/null 2>&1 || true; }
+    #end if
+    ]]></command>
+    <inputs>
+        <!-- Source of the accessions: typed in directly, or read from a text dataset. -->
+        <conditional name="input_type">
+            <param name="select_input_type"
+                   type="select"
+                   label="Select an input type">
+                <option value="accession_ids">ENA accession IDs</option>
+                <option value="accessions_list">A list of ENA accession IDs, one per row</option>
+            </param>
+            <when value="accession_ids">
+                <param name="accessions"
+                       type="text"
+                       optional="false"
+                       label="Accession IDs"
+                       help="ENA accessions (Study, Sample, Experiment, Run accession) separated by whitespaces"/>
+            </when>
+            <when value="accessions_list">
+                <param name="accessions_file"
+                       type="data"
+                       format="txt"
+                       optional="false"
+                       label="Accession IDs File"
+                       help="ENA accessions (Study, Sample, Experiment, Run accession) stored in a file. One accession per line"/>
+            </when>
+        </conditional>
+        <!-- Each checkbox yields its truevalue (a fastq-dl flag) when ticked and an empty string otherwise. -->
+        <param name="group_by_experiment"
+               type="boolean"
+               truevalue="--group-by-experiment"
+               falsevalue=""
+               label="Group by Experiment"
+               help="Group Runs by experiment accession"/>
+        <param name="group_by_sample"
+               type="boolean"
+               truevalue="--group-by-sample"
+               falsevalue=""
+               label="Group by Sample"
+               help="Group Runs by sample accession"/>
+        <param name="only_download_metadata"
+               type="boolean"
+               truevalue="--only-download-metadata"
+               falsevalue=""
+               label="Only Download Metadata"
+               help="Skip FASTQ download and retrieve metadata only"/>
+    </inputs>
+    <outputs>
+        <!-- Run-info tables that the command moved to logs/ as ACCESSION-fastq-run-info.tsv;
+             the part before the fixed suffix becomes the element name. -->
+        <collection name="metadata" type="list" label="Metadata files">
+            <discover_datasets pattern="(?P&lt;designation&gt;.+)-fastq-run-info\.tsv" directory="logs" ext="tsv" />
+        </collection>
+        <!-- Both FASTQ collections are dropped when only metadata was requested. -->
+        <collection name="single_end_collection" type="list" label="Single-end data">
+            <filter>not only_download_metadata</filter>
+            <discover_datasets pattern="(?P&lt;designation&gt;.+)\.fastq\.gz" directory="single-end" ext="fastq.gz" />
+        </collection>
+        <!-- Files in paired-end/ are named NAME_forward.fastq.gz and NAME_reverse.fastq.gz by the command,
+             so identifier_0 is the pair name and identifier_1 is forward or reverse. -->
+        <collection name="paired_end_collection" type="list:paired" label="Paired-end data">
+            <filter>not only_download_metadata</filter>
+            <discover_datasets pattern="(?P&lt;identifier_0&gt;[^_]+)_(?P&lt;identifier_1&gt;[^_]+)\.fastq\.gz" directory="paired-end" ext="fastq.gz" />
+        </collection>
+    </outputs>
+    <tests>
+        <!-- #1 Accessions supplied as a dataset (accessions.txt): expects four metadata tables,
+             two single-end elements and two forward/reverse pairs. The FASTQ elements set
+             decompress, presumably so that has_text can match the first read header.
+             NOTE(review): the has_size values equal the compressed fastq_bytes recorded in the
+             Metadata_files fixtures (e.g. 23102 for DRR011117) although decompress is enabled
+             here; confirm which representation has_size is checked against. -->
+        <test expect_num_outputs="3">
+            <param name="select_input_type" value="accessions_list" />
+            <param name="accessions_file" value="accessions.txt" />
+            <output_collection name="metadata" type="list" count="4">
+                <element name="DRR011117" file="Metadata_files/DRR011117.tsv" />
+                <element name="ERR2651925" file="Metadata_files/ERR2651925.tsv" />
+                <element name="ERR4319712" file="Metadata_files/ERR4319712.tsv" />
+                <element name="SRR9678965" file="Metadata_files/SRR9678965.tsv" />
+            </output_collection>
+            <output_collection name="single_end_collection" type="list" count="2">
+                <element name="DRR011117" decompress="True">
+                    <assert_contents>
+                        <has_text text="@DRR011117.1 HXVJWSB01AD414/4" />
+                        <has_size size="23102" />
+                    </assert_contents>
+                </element>
+                <element name="SRR9678965" decompress="True">
+                    <assert_contents>
+                        <has_text text="@SRR9678965.1 HQCI9RE01A6I97/2" />
+                        <has_size size="2465043" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output_collection name="paired_end_collection" type="list:paired" count="2">
+                <element name="ERR2651925" decompress="True">
+                    <element name="forward" decompress="True">
+                        <assert_contents>
+                            <has_text text="@ERR2651925.1 M01945:48:000000000-B9G5G:1:1102:16788:1675/1" />
+                            <has_size size="4977454" />
+                        </assert_contents>
+                    </element>
+                    <element name="reverse" decompress="True">
+                        <assert_contents>
+                            <has_text text="@ERR2651925.1 M01945:48:000000000-B9G5G:1:1102:16788:1675/2" />
+                            <has_size size="6079979" />
+                        </assert_contents>
+                    </element>
+                </element>
+                <element name="ERR4319712" decompress="True">
+                    <element name="forward" decompress="True">
+                        <assert_contents>
+                            <has_text text="@ERR4319712.1 M02944:93:000000000-ALWFJ:1:2105:13646:2309/1" />
+                            <has_size size="2104680" />
+                        </assert_contents>
+                    </element>
+                    <element name="reverse" decompress="True">
+                        <assert_contents>
+                            <has_text text="@ERR4319712.1 M02944:93:000000000-ALWFJ:1:2105:13646:2309/2" />
+                            <has_size size="2578613" />
+                        </assert_contents>
+                    </element>
+                </element>
+            </output_collection>
+        </test>
+        <!-- #2 The same four run accessions typed in as one space-separated text value.
+             Expects the same element names as test #1; only file sizes are asserted for the FASTQ data. -->
+        <test expect_num_outputs="3">
+            <param name="select_input_type" value="accession_ids" />
+            <param name="accessions" value="ERR4319712 DRR011117 ERR2651925 SRR9678965" />
+            <output_collection name="metadata" type="list" count="4">
+                <element name="DRR011117" file="Metadata_files/DRR011117.tsv" />
+                <element name="ERR2651925" file="Metadata_files/ERR2651925.tsv" />
+                <element name="ERR4319712" file="Metadata_files/ERR4319712.tsv" />
+                <element name="SRR9678965" file="Metadata_files/SRR9678965.tsv" />
+            </output_collection>
+            <output_collection name="single_end_collection" type="list" count="2">
+                <element name="DRR011117">
+                    <assert_contents>
+                        <has_size size="23102" />
+                    </assert_contents>
+                </element>
+                <element name="SRR9678965">
+                    <assert_contents>
+                        <has_size size="2465043" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output_collection name="paired_end_collection" type="list:paired" count="2">
+                <element name="ERR2651925">
+                    <element name="forward">
+                        <assert_contents>
+                            <has_size size="4977454" />
+                        </assert_contents>
+                    </element>
+                    <element name="reverse">
+                        <assert_contents>
+                            <has_size size="6079979" />
+                        </assert_contents>
+                    </element>
+                </element>
+                <element name="ERR4319712">
+                    <element name="forward">
+                        <assert_contents>
+                            <has_size size="2104680" />
+                        </assert_contents>
+                    </element>
+                    <element name="reverse">
+                        <assert_contents>
+                            <has_size size="2578613" />
+                        </assert_contents>
+                    </element>
+                </element>
+            </output_collection>
+        </test>
+        <!-- #3 Metadata-only mode: the output filters remove both FASTQ collections,
+             so the metadata collection is the single expected output. -->
+        <test expect_num_outputs="1">
+            <param name="select_input_type" value="accessions_list" />
+            <param name="accessions_file" value="accessions.txt" />
+            <param name="only_download_metadata" value="--only-download-metadata" />
+            <output_collection name="metadata" type="list" count="4">
+                <element name="DRR011117" file="Metadata_files/DRR011117.tsv" />
+                <element name="ERR2651925" file="Metadata_files/ERR2651925.tsv" />
+                <element name="ERR4319712" file="Metadata_files/ERR4319712.tsv" />
+                <element name="SRR9678965" file="Metadata_files/SRR9678965.tsv" />
+            </output_collection>
+        </test>
+        <!-- #4 Group by experiment: FASTQ elements are expected under experiment accessions
+             (e.g. DRX010073 is the experiment_accession of run DRR011117 in its metadata fixture),
+             while the metadata elements keep the requested run accessions as names. -->
+        <test expect_num_outputs="3">
+            <param name="select_input_type" value="accessions_list" />
+            <param name="accessions_file" value="accessions.txt" />
+            <param name="group_by_experiment" value="--group-by-experiment" />
+            <output_collection name="metadata" type="list" count="4">
+                <element name="DRR011117" file="Metadata_files/DRR011117.tsv" />
+                <element name="ERR2651925" file="Metadata_files/ERR2651925.tsv" />
+                <element name="ERR4319712" file="Metadata_files/ERR4319712.tsv" />
+                <element name="SRR9678965" file="Metadata_files/SRR9678965.tsv" />
+            </output_collection>
+            <output_collection name="single_end_collection" type="list" count="2">
+                <element name="DRX010073">
+                    <assert_contents>
+                        <has_size size="23102" />
+                    </assert_contents>
+                </element>
+                <element name="SRX6439351">
+                    <assert_contents>
+                        <has_size size="2465043" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output_collection name="paired_end_collection" type="list:paired" count="2">
+                <element name="ERX2668415">
+                    <element name="forward">
+                        <assert_contents>
+                            <has_size size="4977454" />
+                        </assert_contents>
+                    </element>
+                    <element name="reverse">
+                        <assert_contents>
+                            <has_size size="6079979" />
+                        </assert_contents>
+                    </element>
+                </element>
+                <element name="ERX4268079">
+                    <element name="forward">
+                        <assert_contents>
+                            <has_size size="2104680" />
+                        </assert_contents>
+                    </element>
+                    <element name="reverse">
+                        <assert_contents>
+                            <has_size size="2578613" />
+                        </assert_contents>
+                    </element>
+                </element>
+            </output_collection>
+        </test>
+        <!-- #5 Group by sample: FASTQ elements are expected under sample accessions
+             (e.g. SAMD00008419 is the sample_accession of run DRR011117 in its metadata fixture),
+             while the metadata elements keep the requested run accessions as names. -->
+        <test expect_num_outputs="3">
+            <param name="select_input_type" value="accessions_list" />
+            <param name="accessions_file" value="accessions.txt" />
+            <param name="group_by_sample" value="--group-by-sample" />
+            <output_collection name="metadata" type="list" count="4">
+                <element name="DRR011117" file="Metadata_files/DRR011117.tsv" />
+                <element name="ERR2651925" file="Metadata_files/ERR2651925.tsv" />
+                <element name="ERR4319712" file="Metadata_files/ERR4319712.tsv" />
+                <element name="SRR9678965" file="Metadata_files/SRR9678965.tsv" />
+            </output_collection>
+            <output_collection name="single_end_collection" type="list" count="2">
+                <element name="SAMD00008419">
+                    <assert_contents>
+                        <has_size size="23102" />
+                    </assert_contents>
+                </element>
+                <element name="SAMN12272107">
+                    <assert_contents>
+                        <has_size size="2465043" />
+                    </assert_contents>
+                </element>
+            </output_collection>
+            <output_collection name="paired_end_collection" type="list:paired" count="2">
+                <element name="SAMEA4724129">
+                    <element name="forward">
+                        <assert_contents>
+                            <has_size size="4977454" />
+                        </assert_contents>
+                    </element>
+                    <element name="reverse">
+                        <assert_contents>
+                            <has_size size="6079979" />
+                        </assert_contents>
+                    </element>
+                </element>
+                <element name="SAMEA7040559">
+                    <element name="forward">
+                        <assert_contents>
+                            <has_size size="2104680" />
+                        </assert_contents>
+                    </element>
+                    <element name="reverse">
+                        <assert_contents>
+                            <has_size size="2578613" />
+                        </assert_contents>
+                    </element>
+                </element>
+            </output_collection>
+        </test>
+    </tests>
+    <help><![CDATA[
+
+This tool downloads FASTQ files from the European Nucleotide Archive (ENA) based on a list of ENA accession IDs.
+You can provide either accession IDs in text format or upload a file containing accession IDs (one per line).
+The tool also allows you to group downloaded data by experiment or sample and can optionally retrieve only metadata
+without downloading the FASTQ files.
+
+Input Types
+-----------
+
+You can select from two types of inputs:
+
+1. **ENA Accession IDs (Text Input)**:
+   - Provide a list of ENA accession IDs (e.g., Study, Sample, Experiment, or Run accessions) separated by whitespace.
+   
+2. **Accession IDs File**:
+   - Provide a file containing a list of ENA accession IDs, one per line.
+
+Parameters
+----------
+
+- **Group by Experiment**:  
+   This option groups the downloaded runs by the experiment accession, which can be useful if you need to process 
+   data related to a specific experiment.
+
+- **Group by Sample**:  
+   This option groups the downloaded runs by the sample accession.
+
+- **Only Download Metadata**:  
+   Select this option if you only want to retrieve metadata without downloading the actual FASTQ files. This is 
+   useful if you need information about the runs but do not need the raw sequence data.
+
+Outputs
+-------
+
+The tool generates up to three types of outputs (the two FASTQ collections are omitted when **Only Download Metadata** is selected):
+
+1. **Metadata Files**:  
+   This collection contains metadata files for each accession, in `.tsv` format, which provide details about the 
+   corresponding run.
+
+2. **Single-End Data**:  
+   Downloaded FASTQ files that contain single-end reads are placed into a separate collection,
+   in `.fastq.gz` format.
+
+3. **Paired-End Data**:  
+   Downloaded FASTQ files that contain paired-end reads are grouped into pairs (forward and reverse).
+   The paired files are also placed in a separate collection and are in `.fastq.gz` format.
+
+    ]]></help>
+    <expand macro="citations"/>
+</tool>
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/macros.xml	Sat Nov 16 18:43:55 2024 +0000
@@ -0,0 +1,42 @@
+<macros>
+    <!-- Version of the fastq-dl package pinned in the requirements below; also the first part of the tool version. -->
+    <token name="@TOOL_VERSION@">3.0.0</token>
+    <!-- Wrapper revision, appended to the tool version after "+galaxy". -->
+    <token name="@VERSION_SUFFIX@">0</token>
+    <xml name="requirements">
+        <requirements>
+            <requirement type="package" version="@TOOL_VERSION@">fastq-dl</requirement>
+        </requirements>
+    </xml>
+    <!-- Shell loop shared by both input modes of fastq_dl.xml. It expects a bash array named
+         accessionsarr and a logs/ directory, runs fastq-dl once per accession against ENA only,
+         and then moves fastq-run-info.tsv to logs/ under the accession's name.
+         The trailing "|| true" applies to the whole "fastq-dl, then mv" list, so a failed
+         download or a missing run-info file neither stops the loop nor changes its exit status. -->
+    <token name="@FASTQ_DL_FOR_LOOP@"><![CDATA[
+        for accessionid in "\${accessionsarr[@]}"; do
+            fastq-dl --accession "\$accessionid"
+            --provider ena
+            --only-provider
+            $only_download_metadata
+            $group_by_experiment
+            $group_by_sample
+            &&
+            mv fastq-run-info.tsv logs/"\$accessionid"-fastq-run-info.tsv > /dev/null 2>&1 || true;
+        done
+    ]]></token>
+    <xml name="citations">
+        <citations>
+            <citation type="bibtex">
+                <![CDATA[
+                    @software{petit2024fastq-dl,
+                        author       = {Robert A. Petit III and Michael B. Hall and Gerry Tonkin-Hill and Jie Zhu and Timothy D. Read},
+                        title        = {{fastq-dl}: efficiently download FASTQ files from SRA or ENA repositories},
+                        version      = {2.0.2},
+                        year         = {2024},
+                        url          = {https://github.com/rpetit3/fastq-dl},
+                        note         = {Accessed: 2024-10-31}
+                    }
+                ]]>
+            </citation>
+        </citations>
+    </xml>
+    <xml name="creators">
+        <creator>
+            <person givenName="Rand" familyName="Zoabi" url="https://github.com/RZ9082" identifier="https://orcid.org/0009-0000-2501-8053" />
+        </creator>
+    </xml>
+</macros>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Metadata_files/DRR011117.tsv	Sat Nov 16 18:43:55 2024 +0000
@@ -0,0 +1,2 @@
+run_accession	experiment_title	sample_accession	project_name	submission_accession	library_min_fragment_size	bam_md5	assembly_software	library_prep_longitude	library_selection	pcr_isolation_protocol	chip_protocol	sequencing_primer_provider	serotype	environment_feature	last_updated	submitted_galaxy	extraction_protocol	germline	secondary_project	culture_collection	submission_tool	sra_bytes	read_strand	rna_purity_280_ratio	hi_c_protocol	collected_by	submitted_ftp	restriction_enzyme_target_sequence	isolate	fastq_bytes	instrument_platform	variety	sequencing_date_format	temperature	sra_aspera	ecotype	submitted_aspera	sampling_campaign	bam_ftp	tissue_lib	environmental_sample	control_experiment	sex	submitted_md5	checklist	fastq_galaxy	library_gen_protocol	specimen_voucher	library_prep_latitude	submitted_bytes	taxonomic_identity_marker	run_date	country	ncbi_reporting_standard	sample_description	sra_galaxy	sample_prep_interval	fastq_md5	secondary_study_accession	experimental_protocol	read_count	study_title	bio_material	rna_prep_5_protocol	host_body_site	local_environmental_context	assembly_quality	collection_date_end	sample_capture_status	sample_title	host_genotype	host_phenotype	environmental_medium	cultivar	instrument_model	faang_library_selection	target_gene	bam_bytes	library_max_fragment_size	experiment_target	sequencing_date	description	nominal_sdev	chip_ab_provider	environment_material	host_tax_id	sample_material	sample_storage_processing	sra_md5	cell_type	fastq_ftp	disease	sample_prep_interval_units	broker_name	sub_strain	base_count	library_strategy	restriction_site	serovar	investigation_type	location	library_source	sra_ftp	age	library_layout	experimental_factor	sequencing_primer_catalog	environment_biome	rna_purity_230_ratio	dnase_protocol	dev_stage	library_prep_date_format	bam_aspera	binning_software	datahub	rna_integrity_num	library_prep_date	location_start	marine_region	aligned	file_location	sample_collection	chip_target	nominal_length	
broad_scale_environmental_context	sequencing_location	status	completeness_score	lon	fastq_aspera	tax_lineage	host_sex	library_pcr_isolation_protocol	sample_alias	mating_type	collection_date_start	sub_species	contamination_score	run_alias	restriction_enzyme	depth	submitted_read_type	library_construction_protocol	host_growth_conditions	collection_date	experiment_alias	host_gravidity	center_name	identified_by	cell_line	sampling_site	host	library_name	tag	first_created	lat	strain	experiment_accession	scientific_name	host_status	tax_id	study_accession	submitted_format	submitted_host_sex	bisulfite_protocol	altitude	rt_prep_protocol	host_scientific_name	bam_galaxy	accession	secondary_sample_accession	sample_storage	cage_protocol	sampling_platform	taxonomic_classification	location_end	protocol_label	elevation	salinity	sequencing_method	sequencing_primer_lot	first_public	transposase_protocol	study_alias
+DRR011117	454 GS Junior sequencing: HXVJWSB01__Yaku_0782__ITS3	SAMD00008419	rhizosphere metagenome	DRA001010					PCR						2015-06-19							163157								23102	LS454				fasp.sra.ebi.ac.uk:/vol1/drr/DRR011/DRR011117											ftp.sra.ebi.ac.uk/vol1/fastq/DRR011/DRR011117/DRR011117.fastq.gz						1352764800000		Generic	HXVJWSB01__Yaku_0782__ITS3	ftp.sra.ebi.ac.uk/vol1/drr/DRR011/DRR011117		b737064403c17493036beee0987a0556	DRP001052		104	Complex community structure of ectomycorrhizal, arbuscular-mycorrhizal and root-endophytic fungi in a mixed subtropical forest of ectomycorrhizal and arbuscular-mycorrhizal plants								HXVJWSB01__Yaku_0782__ITS3					454 GS Junior							454 GS Junior sequencing: HXVJWSB01__Yaku_0782__ITS3							105143a627249be0bcc20c7755c1adcf		ftp.sra.ebi.ac.uk/vol1/fastq/DRR011/DRR011117/DRR011117.fastq.gz					51498	AMPLICON					METAGENOMIC	ftp.sra.ebi.ac.uk/vol1/drr/DRR011/DRR011117		SINGLE										dcc_metagenome												public			fasp.sra.ebi.ac.uk:/vol1/fastq/DRR011/DRR011117/DRR011117.fastq.gz	1;2787823;12908;408169;410657;939928			SAMD00008419					DRR011117				We sequenced fungal ITS sequences based on a tag-encoded massively-parallel pyrosequencing. For each root sample, the entire ITS region and partial ribosomal large subunit region was amplified using the fungus-specific high-coverage primer ITS1F_KYO2 and the universal primer LR3 (http://www.biology.duke.edu /fungi/mycolab/primers.htm). PCR was conducted using a temperature profile of 95??C for 10 min, followed by 20 cycles at 94??C for 20 s, 50??C for 30 s, 72??C for 120 s and a final extension at 72??C for 7 min with a buffer system of Ampdirect Plus and BIOTAQ HS DNA Polymerase (Shimadzu, Kyoto, Japan). The PCR product of each root sample was subjected to a second PCR step targeting the ITS2 region. 
The second PCR was conducted with a universal primer ITS3_KYO2 fused with the 454 Adaptor A and each sample-specific molecular ID, and the reverse universal primer LR_KYO1b (5'-MGC WGC ATT CCC AAA CWA-3') fused with the 454 Adaptor B. A buffer system of Taq DNA Polymerase with Standard Taq Buffer (New England BioLabs, Ipswich, MA, USA) was used under a temperature profile of 95??C for 1 min, followed by 40 cycles at 94??C for 20 s, 50??C for 30 s, 72??C for 60 s and a final extension at 72??C for 7 min. The ITS amplicons of the second PCR step were subjected to pyrosequencing. The first 576 and the second 624 samples were sequenced separately using a GS Junior sequencer (Roche). The rbcL amplicons of the first 480 root samples were pooled and purified using ExoSAP-IT (GE Healthcare) and QIAquick PCR Purification Kit (QIAGEN). The sequencing of the first 576 samples was conducted as instructed by the manufacturer. Likewise, the amplicons of the remaining 624 samples were pooled and purified, and then sequenced in the second run.			DRX010073		KYOTO_HE					HXVJWSB01__Yaku_0782__ITS3		2014-04-28			DRX010073	rhizosphere metagenome		939928	PRJDB2078								DRR011117	DRS009918											2014-04-28		PRJDB2078
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Metadata_files/ERR2651925.tsv	Sat Nov 16 18:43:55 2024 +0000
@@ -0,0 +1,2 @@
+run_accession	experiment_title	sample_accession	project_name	submission_accession	library_min_fragment_size	bam_md5	assembly_software	library_prep_longitude	library_selection	pcr_isolation_protocol	chip_protocol	sequencing_primer_provider	serotype	environment_feature	last_updated	submitted_galaxy	extraction_protocol	germline	secondary_project	culture_collection	submission_tool	sra_bytes	read_strand	rna_purity_280_ratio	hi_c_protocol	collected_by	submitted_ftp	restriction_enzyme_target_sequence	isolate	fastq_bytes	instrument_platform	variety	sequencing_date_format	temperature	sra_aspera	ecotype	submitted_aspera	sampling_campaign	bam_ftp	tissue_lib	environmental_sample	control_experiment	sex	submitted_md5	checklist	fastq_galaxy	library_gen_protocol	specimen_voucher	library_prep_latitude	submitted_bytes	taxonomic_identity_marker	run_date	country	ncbi_reporting_standard	sample_description	sra_galaxy	sample_prep_interval	fastq_md5	secondary_study_accession	experimental_protocol	read_count	study_title	bio_material	rna_prep_5_protocol	host_body_site	local_environmental_context	assembly_quality	collection_date_end	sample_capture_status	sample_title	host_genotype	host_phenotype	environmental_medium	cultivar	instrument_model	faang_library_selection	target_gene	bam_bytes	library_max_fragment_size	experiment_target	sequencing_date	description	nominal_sdev	chip_ab_provider	environment_material	host_tax_id	sample_material	sample_storage_processing	sra_md5	cell_type	fastq_ftp	disease	sample_prep_interval_units	broker_name	sub_strain	base_count	library_strategy	restriction_site	serovar	investigation_type	location	library_source	sra_ftp	age	library_layout	experimental_factor	sequencing_primer_catalog	environment_biome	rna_purity_230_ratio	dnase_protocol	dev_stage	library_prep_date_format	bam_aspera	binning_software	datahub	rna_integrity_num	library_prep_date	location_start	marine_region	aligned	file_location	sample_collection	chip_target	nominal_length	
broad_scale_environmental_context	sequencing_location	status	completeness_score	lon	fastq_aspera	tax_lineage	host_sex	library_pcr_isolation_protocol	sample_alias	mating_type	collection_date_start	sub_species	contamination_score	run_alias	restriction_enzyme	depth	submitted_read_type	library_construction_protocol	host_growth_conditions	collection_date	experiment_alias	host_gravidity	center_name	identified_by	cell_line	sampling_site	host	library_name	tag	first_created	lat	strain	experiment_accession	scientific_name	host_status	tax_id	study_accession	submitted_format	submitted_host_sex	bisulfite_protocol	altitude	rt_prep_protocol	host_scientific_name	bam_galaxy	accession	secondary_sample_accession	sample_storage	cage_protocol	sampling_platform	taxonomic_classification	location_end	protocol_label	elevation	salinity	sequencing_method	sequencing_primer_lot	first_public	transposase_protocol	study_alias	library_prep_location	rna_prep_3_protocol	ph	sequencing_longitude	tissue_type	isolation_source
+ERR2651925	Illumina MiSeq paired end sequencing	SAMEA4724129		ERA1521386					PCR						2018-11-16	ftp.sra.ebi.ac.uk/vol1/run/ERR265/ERR2651925/HDG2C-2016-10-r2-ITS_ACAGAC_L001_R1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/run/ERR265/ERR2651925/HDG2C-2016-10-r2-ITS_ACAGAC_L001_R2.fastq.gz						14052012				Lydie Kerdraon	ftp.sra.ebi.ac.uk/vol1/run/ERR265/ERR2651925/HDG2C-2016-10-r2-ITS_ACAGAC_L001_R1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/run/ERR265/ERR2651925/HDG2C-2016-10-r2-ITS_ACAGAC_L001_R2.fastq.gz			4977454;6079979	ILLUMINA				fasp.sra.ebi.ac.uk:/vol1/err/ERR265/005/ERR2651925		fasp.sra.ebi.ac.uk:/vol1/run/ERR265/ERR2651925/HDG2C-2016-10-r2-ITS_ACAGAC_L001_R1.fastq.gz;fasp.sra.ebi.ac.uk:/vol1/run/ERR265/ERR2651925/HDG2C-2016-10-r2-ITS_ACAGAC_L001_R2.fastq.gz							37493c0f131917a24acf4c535dc5b6b0;47d07f4bc698f774cc66d5d3111b8e60	ERC000011	ftp.sra.ebi.ac.uk/vol1/fastq/ERR265/005/ERR2651925/ERR2651925_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR265/005/ERR2651925/ERR2651925_2.fastq.gz				4821716;5918907			France		ITS1 region associated to Crop residues of Wheat (Rotation)	ftp.sra.ebi.ac.uk/vol1/err/ERR265/005/ERR2651925		6a1679e27fac1783e3c752cf89a6eb3f;d9795cac381bfcce1acb09a94729a280	ERP109315		49902	Microbial assemblages associated to crop residues						2016-10-31		Crop residues of Wheat (Rotation)				Soissons	Illumina MiSeq							Illumina MiSeq paired end sequencing							1fde860669a32db7f8a9e46b6491fa77		ftp.sra.ebi.ac.uk/vol1/fastq/ERR265/005/ERR2651925/ERR2651925_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR265/005/ERR2651925/ERR2651925_2.fastq.gz					24951000	AMPLICON					METAGENOMIC	ftp.sra.ebi.ac.uk/vol1/err/ERR265/005/ERR2651925		PAIRED										dcc_metagenome									425			public			fasp.sra.ebi.ac.uk:/vol1/fastq/ERR265/005/ERR2651925/ERR2651925_1.fastq.gz;fasp.sra.ebi.ac.uk:/vol1/fastq/ERR265/005/ERR2651925/ERR2651925_2.fastq.gz	1;131567;2759;33090;35493;131221;3193;58023;78536;58024;3398;1437183;4447;1437197;4734;38820;4479;359160;147368;1648038;147389;1648030;4564;4565	
		HDG2C.2016.10.r2.ITS_ACAGAC		2016-09-30			ena-RUN-IRHS-19-06-2018-10:38:11:697-377			PAIRED;PAIRED	PCR based protocol		2016-10	ena-EXPERIMENT-IRHS-19-06-2018-10:38:11:697-377		IRHS					PCR_ITS1		2018-06-19			ERX2668415	Triticum aestivum		4565	PRJEB27255	FASTQ;FASTQ							ERR2651925	ERS2544267											2018-11-30		ena-STUDY-IRHS-12-06-2018-07:39:33:052-533						Crop residues
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Metadata_files/ERR4319712.tsv	Sat Nov 16 18:43:55 2024 +0000
@@ -0,0 +1,2 @@
+run_accession	experiment_title	sample_accession	project_name	submission_accession	library_min_fragment_size	bam_md5	assembly_software	library_prep_longitude	library_selection	pcr_isolation_protocol	chip_protocol	sequencing_primer_provider	serotype	environment_feature	last_updated	submitted_galaxy	extraction_protocol	germline	secondary_project	culture_collection	submission_tool	sra_bytes	read_strand	rna_purity_280_ratio	hi_c_protocol	collected_by	submitted_ftp	restriction_enzyme_target_sequence	isolate	fastq_bytes	instrument_platform	variety	sequencing_date_format	temperature	sra_aspera	ecotype	submitted_aspera	sampling_campaign	bam_ftp	tissue_lib	environmental_sample	control_experiment	sex	submitted_md5	checklist	fastq_galaxy	library_gen_protocol	specimen_voucher	library_prep_latitude	submitted_bytes	taxonomic_identity_marker	run_date	country	ncbi_reporting_standard	sample_description	sra_galaxy	sample_prep_interval	fastq_md5	secondary_study_accession	experimental_protocol	read_count	study_title	bio_material	rna_prep_5_protocol	host_body_site	local_environmental_context	assembly_quality	collection_date_end	sample_capture_status	sample_title	host_genotype	host_phenotype	environmental_medium	cultivar	instrument_model	faang_library_selection	target_gene	bam_bytes	library_max_fragment_size	experiment_target	sequencing_date	description	nominal_sdev	chip_ab_provider	environment_material	host_tax_id	sample_material	sample_storage_processing	sra_md5	cell_type	fastq_ftp	disease	sample_prep_interval_units	broker_name	sub_strain	base_count	library_strategy	restriction_site	serovar	investigation_type	location	library_source	sra_ftp	age	library_layout	experimental_factor	sequencing_primer_catalog	environment_biome	rna_purity_230_ratio	dnase_protocol	dev_stage	library_prep_date_format	bam_aspera	binning_software	datahub	rna_integrity_num	library_prep_date	location_start	marine_region	aligned	file_location	sample_collection	chip_target	nominal_length	
broad_scale_environmental_context	sequencing_location	status	completeness_score	lon	fastq_aspera	tax_lineage	host_sex	library_pcr_isolation_protocol	sample_alias	mating_type	collection_date_start	sub_species	contamination_score	run_alias	restriction_enzyme	depth	submitted_read_type	library_construction_protocol	host_growth_conditions	collection_date	experiment_alias	host_gravidity	center_name	identified_by	cell_line	sampling_site	host	library_name	tag	first_created	lat	strain	experiment_accession	scientific_name	host_status	tax_id	study_accession	submitted_format	submitted_host_sex	bisulfite_protocol	altitude	rt_prep_protocol	host_scientific_name	bam_galaxy	accession	secondary_sample_accession	sample_storage	cage_protocol	sampling_platform	taxonomic_classification	location_end	protocol_label	elevation	salinity	sequencing_method	sequencing_primer_lot	first_public	transposase_protocol	study_alias
+ERR4319712	Illumina MiSeq paired end sequencing	SAMEA7040559		ERA2762825					PCR						2020-07-06	ftp.sra.ebi.ac.uk/vol1/run/ERR431/ERR4319712/STROMAEQ-100_TTCTTG_L001_R1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/run/ERR431/ERR4319712/STROMAEQ-100_TTCTTG_L001_R2.fastq.gz						4994168					ftp.sra.ebi.ac.uk/vol1/run/ERR431/ERR4319712/STROMAEQ-100_TTCTTG_L001_R1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/run/ERR431/ERR4319712/STROMAEQ-100_TTCTTG_L001_R2.fastq.gz			2104680;2578613	ILLUMINA				fasp.sra.ebi.ac.uk:/vol1/err/ERR431/002/ERR4319712		fasp.sra.ebi.ac.uk:/vol1/run/ERR431/ERR4319712/STROMAEQ-100_TTCTTG_L001_R1.fastq.gz;fasp.sra.ebi.ac.uk:/vol1/run/ERR431/ERR4319712/STROMAEQ-100_TTCTTG_L001_R2.fastq.gz							59daac95e6d090255b2a9937d57300e5;f9ce64d451f71471cc3f636e5fecaf36	ERC000011	ftp.sra.ebi.ac.uk/vol1/fastq/ERR431/002/ERR4319712/ERR4319712_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR431/002/ERR4319712/ERR4319712_2.fastq.gz				2055148;2522395					Equine Gut Microbiome	ftp.sra.ebi.ac.uk/vol1/err/ERR431/002/ERR4319712		a4f8e84258f24104cf60e9a1a50511df;49876c41e079a0b4adfa49e22cd1897e	ERP122744		16481	Strongyle Infection and Gut Microbiota: Profiling of Resistant and Susceptible Horses Over a Grazing Season								Equine Gut Microbiome					Illumina MiSeq							Illumina MiSeq paired end sequencing							8951789af9d5e44cf180f0c72a81fd00		ftp.sra.ebi.ac.uk/vol1/fastq/ERR431/002/ERR4319712/ERR4319712_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR431/002/ERR4319712/ERR4319712_2.fastq.gz					8240500	AMPLICON					METAGENOMIC	ftp.sra.ebi.ac.uk/vol1/err/ERR431/002/ERR4319712		PAIRED										dcc_metagenome									300			public			fasp.sra.ebi.ac.uk:/vol1/fastq/ERR431/002/ERR4319712/ERR4319712_1.fastq.gz;fasp.sra.ebi.ac.uk:/vol1/fastq/ERR431/002/ERR4319712/ERR4319712_2.fastq.gz	1;131567;2759;33154;33208;6072;33213;33511;7711;89593;7742;7776;117570;117571;8287;1338369;32523;32524;40674;32525;9347;1437010;314145;9787;9788;9789;9796			STROMAEQ-100					ena-RUN-VETERINARY 
Faculty-06-07-2020-09:59:55:879-100			PAIRED;PAIRED				ena-EXPERIMENT-VETERINARY Faculty-06-07-2020-09:59:55:879-100		VETERINARY Faculty					unspecified		2020-07-06			ERX4268079	Equus caballus		9796	PRJEB39250	FASTQ;FASTQ							ERR4319712	ERS4804749											2020-07-26		ena-STUDY-VETERINARY Faculty-06-07-2020-10:00:04:543-1358
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/Metadata_files/SRR9678965.tsv	Sat Nov 16 18:43:55 2024 +0000
@@ -0,0 +1,2 @@
+run_accession	experiment_title	sample_accession	project_name	submission_accession	library_min_fragment_size	bam_md5	assembly_software	library_prep_longitude	library_selection	pcr_isolation_protocol	chip_protocol	sequencing_primer_provider	serotype	environment_feature	last_updated	submitted_galaxy	extraction_protocol	germline	secondary_project	culture_collection	submission_tool	sra_bytes	read_strand	rna_purity_280_ratio	hi_c_protocol	collected_by	submitted_ftp	restriction_enzyme_target_sequence	isolate	fastq_bytes	instrument_platform	variety	sequencing_date_format	temperature	sra_aspera	ecotype	submitted_aspera	sampling_campaign	bam_ftp	tissue_lib	environmental_sample	control_experiment	sex	submitted_md5	checklist	fastq_galaxy	library_gen_protocol	specimen_voucher	library_prep_latitude	submitted_bytes	taxonomic_identity_marker	run_date	country	ncbi_reporting_standard	sample_description	sra_galaxy	sample_prep_interval	fastq_md5	secondary_study_accession	experimental_protocol	read_count	study_title	bio_material	rna_prep_5_protocol	host_body_site	local_environmental_context	assembly_quality	collection_date_end	sample_capture_status	sample_title	host_genotype	host_phenotype	environmental_medium	cultivar	instrument_model	faang_library_selection	target_gene	bam_bytes	library_max_fragment_size	experiment_target	sequencing_date	description	nominal_sdev	chip_ab_provider	environment_material	host_tax_id	sample_material	sample_storage_processing	sra_md5	cell_type	fastq_ftp	disease	sample_prep_interval_units	broker_name	sub_strain	base_count	library_strategy	restriction_site	serovar	investigation_type	location	library_source	sra_ftp	age	library_layout	experimental_factor	sequencing_primer_catalog	environment_biome	rna_purity_230_ratio	dnase_protocol	dev_stage	library_prep_date_format	bam_aspera	binning_software	datahub	rna_integrity_num	library_prep_date	location_start	marine_region	aligned	file_location	sample_collection	chip_target	nominal_length	
broad_scale_environmental_context	sequencing_location	status	completeness_score	lon	fastq_aspera	tax_lineage	host_sex	library_pcr_isolation_protocol	sample_alias	mating_type	collection_date_start	sub_species	contamination_score	run_alias	restriction_enzyme	depth	submitted_read_type	library_construction_protocol	host_growth_conditions	collection_date	experiment_alias	host_gravidity	center_name	identified_by	cell_line	sampling_site	host	library_name	tag	first_created	lat	strain	experiment_accession	scientific_name	host_status	tax_id	study_accession	submitted_format	submitted_host_sex	bisulfite_protocol	altitude	rt_prep_protocol	host_scientific_name	bam_galaxy	accession	secondary_sample_accession	sample_storage	cage_protocol	sampling_platform	taxonomic_classification	location_end	protocol_label	elevation	salinity	sequencing_method	sequencing_primer_lot	first_public	transposase_protocol	study_alias	library_prep_location	rna_prep_3_protocol	ph	sequencing_longitude	tissue_type	isolation_source
+SRR9678965	454 GS FLX sequencing: Amplicon seqencing of Homo sapiens: adult skin surface	SAMN12272107		SRA920837					PCR						2019-09-12							3528954								2465043	LS454				fasp.sra.ebi.ac.uk:/vol1/srr/SRR967/005/SRR9678965											ftp.sra.ebi.ac.uk/vol1/fastq/SRR967/005/SRR9678965/SRR9678965.fastq.gz							Germany: Duesseldorf	Metagenome or environmental	Metagenome or environmental sample from human skin metagenome	ftp.sra.ebi.ac.uk/vol1/srr/SRR967/005/SRR9678965		6c04efc21529bb1b4bbd2758435dd491	SRP214545		11487	16S rRNA gene profiling of atopic dermatitis and psoriasis patients compared to healthy volunteers						2016-12-31		Metagenome or environmental sample from human skin metagenome					454 GS FLX							454 GS FLX sequencing: Amplicon seqencing of Homo sapiens: adult skin surface				9606			9ba2d40c7ef09e349804bcae64879eae		ftp.sra.ebi.ac.uk/vol1/fastq/SRR967/005/SRR9678965/SRR9678965.fastq.gz					5809265	AMPLICON				51.15 N 6.48 E	METAGENOMIC	ftp.sra.ebi.ac.uk/vol1/srr/SRR967/005/SRR9678965		SINGLE										dcc_metagenome			51.15 N 6.48 E									public		6.48	fasp.sra.ebi.ac.uk:/vol1/fastq/SRR967/005/SRR9678965/SRR9678965.fastq.gz	1;2787823;12908;408169;410656;539655			MAARS Cohort		2016-01-01			plate1_3.007.01.sff.fastq						2016	plate1_3.007.01		SUB5942029				Homo sapiens	plate1_3.007.01	env_geo:terrestrial	2019-09-12	51.15		SRX6439351	human skin metagenome		539655	PRJNA554499						Homo sapiens		SRR9678965	SRS5093728					51.15 N 6.48 E						2019-09-12		PRJNA554499						skin surface
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/accessions.txt	Sat Nov 16 18:43:55 2024 +0000
@@ -0,0 +1,4 @@
+ERR4319712
+DRR011117
+ERR2651925
+SRR9678965