Mercurial > repos > iuc > ncbi_datasets

--- a/datasets_genome.xml	Fri Dec 26 17:17:02 2025 +0000
+++ b/datasets_genome.xml	Wed Jan 14 15:05:01 2026 +0000
@@ -65,20 +65,19 @@
     ## rehydrate
     && datasets rehydrate
         --directory ./
-        #if not $file_choices.decompress
-            --gzip
-        #end if
+        --gzip
         --max-workers \${NCBI_DATASETS_MAX_WORKERS:-10}

     ## rename all faa, fna (resp faa.gz, fna.gz) to fasta (resp fasta.gz) to allow discovery
     && find ncbi_dataset \( -name "*.faa" -o -name "*.fna" -o -name "*.faa.gz" -o -name "*.fna.gz" \) -exec sh -c 'mv {} \$(echo {} | sed "s/.f[an]a\(.gz\)\?\$/.fasta\1/")' \;

     ## unzip all compressed (non-fasta) files (jsonl files are just named .gz)
-    ## note "not decompress" means that the datasets are provided uncompressed (datasets rehydrate is called we --gzip)
+    ## note: datasets rehydrate is always called with --gzip, i.e. all rehydrated files are gzip-compressed;
     ##      in this case we need to decompress all datasets that don't have a Galaxy datatype allowing for compression
     && find ncbi_dataset -name "*.jsonl.gz" -exec sh -c 'mv {} \$(dirname {})/\$(basename {} .gz)' \;
-    #if not $file_choices.decompress
-        && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \;
+    && find ncbi_dataset \( -name "*.gz" ! -name "*fasta.gz" \) -exec gunzip {} \;
+    #if $file_choices.decompress
+        && find ncbi_dataset -name "*fasta.gz" -exec gunzip {} \;
     #end if

     #if "seq-report" in $file_choices.include
@@ -174,38 +173,43 @@
         </collection>
     </outputs>
     <tests>
+        <!-- download sequence and non-sequence data to test if unzipping works
+             sequence should be downloaded as gz and non-sequence unzipped
+
+             restrict download size for testing by using release date filtering
+             -->
         <test expect_num_outputs="3">
             <conditional name="query|subcommand">
                 <param name="download_by" value="taxon"/>
                 <param name="taxon_positional" value="human"/>
             </conditional>
             <section name="filters">
-                <param name="chromosomes" value="21"/>
-                <param name="released_before" value="01/01/2018"/>
+                <param name="released_after" value="08/31/2004"/>
+                <param name="released_before" value="01/01/2005"/>
             </section>
             <section name="file_choices">
-                <!-- include a sequence (which should be downloaded as fasta.gz)
-                     and one non-sequence (which should be decompressed) output -->
                 <param name="include" value="rna,gff3"/>
             </section>
             <output name="genome_data_report">
                 <assert_contents>
                     <has_text text="Assembly Accession&#9;Assembly Name&#9;Assembly Submitter&#9;Organism Name"/>
-                    <has_n_lines min="140"/>
+                    <!-- the report lists 2 assemblies although only one is downloaded (cause unknown),
+                         see https://github.com/ncbi/datasets/issues/553 -->
+                    <has_n_lines n="3"/>
                     <has_n_columns n="4"/>
                 </assert_contents>
             </output>
-            <output_collection name="rna_fasta" type="list">
-                <element name="GCF_000306695.2" decompress="true">
+            <output_collection name="rna_fasta" type="list" count="1">
+                <element name="GCF_000002135.2" decompress="true" ftype="fasta.gz">
                     <assert_contents>
                         <has_text text="&gt;"/>
                     </assert_contents>
                 </element>
             </output_collection>
-            <output_collection name="genomic_gff" type="list">
-                <element name="GCF_000306695.2">
+            <output_collection name="genomic_gff" type="list" count="1">
+                <element name="GCF_000002135.2" ftype="gff3">
                     <assert_contents>
-                        <has_n_lines min="1000000"/>
+                        <has_n_lines min="40000"/>
                         <has_line line="##gff-version 3"/>
                         <has_n_columns n="9" comment="#"/>
                     </assert_contents>
@@ -485,11 +489,20 @@
                 <param name="taxon_positional" value="4932"/>
                 <param name="tax_exact_match" value="true"/>
             </conditional>
+            <section name="filters">
+                <param name="released_before" value="11/01/2012"/>
+            </section>
+            <section name="file_choices">
+                <param name="include" value="seq-report"/>
+                <param name="decompress" value="true"/>
+            </section>
             <output name="genome_data_report">
                 <assert_contents>
+                    <has_n_lines n="2"/>
                     <has_text text="Saccharomyces cerevisiae ZTW1" negate="true"/>
                 </assert_contents>
             </output>
+
         </test>
         <!-- test search filter -->
         <test expect_num_outputs="1">
@@ -498,6 +511,9 @@
                 <param name="taxon_positional" value="Streptococcus"/>
             </conditional>
             <section name="filters">
+                <param name="released_before" value="01/01/2010"/>
+            </section>
+            <section name="filters">
                 <repeat name="search">
                     <param name="search" value="pyogenes"/>
                 </repeat>
@@ -507,7 +523,8 @@
             </section>
             <output name="genome_data_report">
                 <assert_contents>
-                    <has_text text="pyogenes"/>
+                    <has_n_lines n="21"/>
+                    <has_text text="pyogenes" n="20"/>
                 </assert_contents>
             </output>
         </test>
--- a/macros.xml	Fri Dec 26 17:17:02 2025 +0000
+++ b/macros.xml	Wed Jan 14 15:05:01 2026 +0000
@@ -1,5 +1,5 @@
 <macros>
-    <token name="@TOOL_VERSION@">18.13.0</token>
+    <token name="@TOOL_VERSION@">18.14.0</token>
     <token name="@VERSION_SUFFIX@">0</token>
     <token name="@PROFILE@">23.0</token>
     <token name="@LICENSE@">MIT</token>
@@ -11,7 +11,7 @@
     <xml name="requirements">
         <requirements>
             <requirement type="package" version="@TOOL_VERSION@">ncbi-datasets-cli</requirement>
-            <requirement type="package" version="2025.8.3">ca-certificates</requirement>
+            <requirement type="package" version="2026.1.4">ca-certificates</requirement>
              <!-- Removed line below because it was causing "skipping: [..] need PK compat. v4.5 (can do v2.1)"  -->
              <!-- <requirement type="package" version="6.0">unzip</requirement> -->
         </requirements>
@@ -380,7 +380,7 @@
     </xml>
     <xml name="released_options" token_released_what="genomes" token_before_or_after="before">
         <param argument="--released-@BEFORE_OR_AFTER@" type="text" optional="true" label="Only include @RELEASED_WHAT@ that have been released @BEFORE_OR_AFTER@ a specified date (MM/DD/YYYY)">
-            <validator type="regex" message="enter a date in the form MM/DD/YYYY">[0-9]{2}/[0-9]{2}/[0-9]{4}</validator>
+            <!-- <validator type="regex" message="enter a date in the form MM/DD/YYYY">[0-9]{2}/[0-9]{2}/[0-9]{4}</validator> -->
         </param>
     </xml>
     <token name="@RELEASED_BEFORE@">#if $filters.released_before: