Mercurial > repos > bgruening > split_file_to_collection
changeset 3:2ddc36385d7a draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/tools/text_processing/split_file_to_collection commit 8d069684e155d2f5b6fae06d14d98ce41321da53"
line wrap: on
line diff
--- a/split_file_to_collection.py Wed Aug 28 10:55:25 2019 -0400 +++ b/split_file_to_collection.py Tue Sep 10 12:31:15 2019 -0400 @@ -140,7 +140,7 @@ new_file_base = [custom_new_file_name, custom_new_file_ext] newfiles = [ - open(out_dir + "/" + new_file_base[0] + "_" + str(count) + new_file_base[1], "w") + open(os.path.join(out_dir, "%s_%06d%s" % (new_file_base[0], count, new_file_base[1])) , "w") for count in range(0, numnew) ] @@ -159,7 +159,7 @@ # keep track in loop of number of records in each file # only used in batch records_in_file = 0 - + # open file with open(in_file, "r") as file: record = ""
--- a/split_file_to_collection.xml Wed Aug 28 10:55:25 2019 -0400 +++ b/split_file_to_collection.xml Tue Sep 10 12:31:15 2019 -0400 @@ -50,8 +50,8 @@ --id_column '$split_parms.split_by.id_col' --match '$split_parms.split_by.match_regex' --sub '$split_parms.split_by.sub_regex' - #else - --numnew '$split_parms.split_by.numnew' + #else + --numnew '$split_parms.split_by.numnew' #if $split_parms.split_by.select_allocate.allocate == "random": --rand --seed '$split_parms.split_by.rand.seed' @@ -190,10 +190,10 @@ <param name="select_split_by" value="row"/> <param name="top" value="2"/> <param name="numnew" value="2"/> - <param name="newfilenames" value="test"/> + <param name="newfilenames" value="test"/> <output_collection name="list_output_tab" type="list"> - <element name="test_0.tabular" file="test_0.tabular" ftype="tabular"/> - <element name="test_1.tabular" file="test_1.tabular" ftype="tabular"/> + <element name="test_000000.tabular" file="test_0.tabular" ftype="tabular"/> + <element name="test_000001.tabular" file="test_1.tabular" ftype="tabular"/> </output_collection> </test> <test> @@ -203,10 +203,44 @@ <param name="top" value="2"/> <param name="numnew" value="2"/> <param name="newfilenames" value="batch_tab"/> - <param name="allocate" value="batch"/> + <param name="allocate" value="batch"/> <output_collection name="list_output_tab" type="list"> - <element name="batch_tab_0.tabular" file="batch_tab_0.tabular" ftype="tabular"/> - <element name="batch_tab_1.tabular" file="batch_tab_1.tabular" ftype="tabular"/> + <element name="batch_tab_000000.tabular" file="batch_tab_0.tabular" ftype="tabular"/> + <element name="batch_tab_000001.tabular" file="batch_tab_1.tabular" ftype="tabular"/> + </output_collection> + </test> + <test> + <param name="select_ftype" value="txt"/> + <param name="input" value="karyotype.txt" ftype="txt"/> + <param name="numnew" value="24"/> + <param name="newfilenames" value="chr"/> + <param name="allocate" value="batch"/> + + <output_collection name="list_output_txt" type="list"> + <element name="chr_000000.txt" file="chr_000000.txt" ftype="txt"/> + <element name="chr_000001.txt" file="chr_000001.txt" ftype="txt"/> + <element name="chr_000002.txt" file="chr_000002.txt" ftype="txt"/> + <element name="chr_000003.txt" file="chr_000003.txt" ftype="txt"/> + <element name="chr_000004.txt" file="chr_000004.txt" ftype="txt"/> + <element name="chr_000005.txt" file="chr_000005.txt" ftype="txt"/> + <element name="chr_000006.txt" file="chr_000006.txt" ftype="txt"/> + <element name="chr_000007.txt" file="chr_000007.txt" ftype="txt"/> + <element name="chr_000008.txt" file="chr_000008.txt" ftype="txt"/> + <element name="chr_000009.txt" file="chr_000009.txt" ftype="txt"/> + <element name="chr_000010.txt" file="chr_000010.txt" ftype="txt"/> + <element name="chr_000011.txt" file="chr_000011.txt" ftype="txt"/> + <element name="chr_000012.txt" file="chr_000012.txt" ftype="txt"/> + <element name="chr_000013.txt" file="chr_000013.txt" ftype="txt"/> + <element name="chr_000014.txt" file="chr_000014.txt" ftype="txt"/> + <element name="chr_000015.txt" file="chr_000015.txt" ftype="txt"/> + <element name="chr_000016.txt" file="chr_000016.txt" ftype="txt"/> + <element name="chr_000017.txt" file="chr_000017.txt" ftype="txt"/> + <element name="chr_000018.txt" file="chr_000018.txt" ftype="txt"/> + <element name="chr_000019.txt" file="chr_000019.txt" ftype="txt"/> + <element name="chr_000020.txt" file="chr_000020.txt" ftype="txt"/> + <element name="chr_000021.txt" file="chr_000021.txt" ftype="txt"/> + <element name="chr_000022.txt" file="chr_000022.txt" ftype="txt"/> + <element name="chr_000023.txt" file="chr_000023.txt" ftype="txt"/> </output_collection> </test> <test> @@ -230,29 +264,29 @@ <param name="numnew" value="3"/> <param name="newfilenames" value="demo"/> <output_collection name="list_output_mgf" type="list"> - <element name="demo_0.mgf" file="demo_0.mgf" ftype="mgf"/> - <element name="demo_1.mgf" file="demo_1.mgf" ftype="mgf"/> - <element name="demo_2.mgf" file="demo_2.mgf" ftype="mgf"/> + <element name="demo_000000.mgf" file="demo_0.mgf" ftype="mgf"/> + <element name="demo_000001.mgf" file="demo_1.mgf" ftype="mgf"/> + <element name="demo_000002.mgf" file="demo_2.mgf" ftype="mgf"/> </output_collection> </test> <test> <param name="input" value="test.fasta" ftype="fasta"/> <param name="select_ftype" value="fasta"/> <param name="numnew" value="2"/> - <param name="newfilenames" value="test"/> + <param name="newfilenames" value="test"/> <output_collection name="list_output_fasta" type="list"> - <element name="test_0.fasta" file="test_0.fasta" ftype="fasta"/> - <element name="test_1.fasta" file="test_1.fasta" ftype="fasta"/> + <element name="test_000000.fasta" file="test_0.fasta" ftype="fasta"/> + <element name="test_000001.fasta" file="test_1.fasta" ftype="fasta"/> </output_collection> </test> <test> <param name="input" value="test.fastq" ftype="fastq"/> <param name="select_ftype" value="fastq"/> <param name="numnew" value="2"/> - <param name="newfilenames" value="test"/> + <param name="newfilenames" value="test"/> <output_collection name="list_output_fastq" type="list"> - <element name="test_0.fastq" file="test_0.fastq" ftype="fastq"/> - <element name="test_1.fastq" file="test_1.fastq" ftype="fastq"/> + <element name="test_000000.fastq" file="test_0.fastq" ftype="fastq"/> + <element name="test_000001.fastq" file="test_1.fastq" ftype="fastq"/> </output_collection> </test> <test> @@ -261,10 +295,10 @@ <param name="numnew" value="2"/> <param name="newfilenames" value="rand"/> <param name="allocate" value="random"/> - <param name="seed" value="1010"/> + <param name="seed" value="1010"/> <output_collection name="list_output_fasta" type="list"> - <element name="rand_0.fasta" file="rand_0.fasta" ftype="fasta"/> - <element name="rand_1.fasta" file="rand_1.fasta" ftype="fasta"/> + <element name="rand_000000.fasta" file="rand_0.fasta" ftype="fasta"/> + <element name="rand_000001.fasta" file="rand_1.fasta" ftype="fasta"/> </output_collection> </test> <test> @@ -274,18 +308,18 @@ <param name="newfilenames" value="fasta_batch"/> <param name="allocate" value="batch"/> <output_collection name="list_output_fasta" type="list"> - <element name="fasta_batch_0.fasta" file="fasta_batch_0.fasta" ftype="fasta"/> - <element name="fasta_batch_1.fasta" file="fasta_batch_1.fasta" ftype="fasta"/> + <element name="fasta_batch_000000.fasta" file="fasta_batch_0.fasta" ftype="fasta"/> + <element name="fasta_batch_000001.fasta" file="fasta_batch_1.fasta" ftype="fasta"/> </output_collection> - </test> + </test> <test> <param name="input" value="test.tabular" ftype="txt"/> <param name="select_ftype" value="txt"/> <param name="numnew" value="2"/> - <param name="newfilenames" value="test"/> + <param name="newfilenames" value="test"/> <output_collection name="list_output_txt" type="list"> - <element name="test_0.txt" file="test_0.tabular" ftype="txt" lines_diff="1"/> - <element name="test_1.txt" file="test_1.tabular" ftype="txt" lines_diff="1"/> + <element name="test_000000.txt" file="test_0.tabular" ftype="txt" lines_diff="1"/> + <element name="test_000001.txt" file="test_1.tabular" ftype="txt" lines_diff="1"/> </output_collection> </test> <test> @@ -293,10 +327,10 @@ <param name="select_ftype" value="generic"/> <param name="generic_regex" value="^.*"/> <param name="numnew" value="2"/> - <param name="newfilenames" value="test"/> + <param name="newfilenames" value="test"/> <output_collection name="list_output_generic" type="list"> - <element name="test_0" file="test_0.tabular" ftype="txt" lines_diff="1"/> - <element name="test_1" file="test_1.tabular" ftype="txt" lines_diff="1"/> + <element name="test_000000" file="test_0.tabular" ftype="txt" lines_diff="1"/> + <element name="test_000001" file="test_1.tabular" ftype="txt" lines_diff="1"/> </output_collection> </test> <test> @@ -306,10 +340,10 @@ <param name="numnew" value="2"/> <param name="newfilenames" value="rand"/> <param name="allocate" value="random"/> - <param name="seed" value="1010"/> + <param name="seed" value="1010"/> <output_collection name="list_output_generic" type="list"> - <element name="rand_0" file="rand_0.fasta" ftype="fasta"/> - <element name="rand_1" file="rand_1.fasta" ftype="fasta"/> + <element name="rand_000000" file="rand_0.fasta" ftype="fasta"/> + <element name="rand_000001" file="rand_1.fasta" ftype="fasta"/> </output_collection> </test> <test> @@ -320,27 +354,27 @@ <param name="newfilenames" value="mol"/> <param name="allocate" value="batch"/> <output_collection name="list_output_generic" type="list"> - <element name="mol_0" file="mol_0.sdf" ftype="sdf"/> - <element name="mol_1" file="mol_1.sdf" ftype="sdf"/> - <element name="mol_2" file="mol_2.sdf" ftype="sdf"/> + <element name="mol_000000" file="mol_0.sdf" ftype="sdf"/> + <element name="mol_000001" file="mol_1.sdf" ftype="sdf"/> + <element name="mol_000002" file="mol_2.sdf" ftype="sdf"/> </output_collection> </test> </tests> <help><![CDATA[ **Split file into a dataset collection** -This tool splits a data sets consisting of records into multiple data sets within a collection. +This tool splits a data sets consisting of records into multiple data sets within a collection. A record can be for instance simply a line, a FASTA sequence (header + sequence), a FASTQ sequence (headers + sequence + qualities), etc. The important property is that the begin of a new record -can be speciefied by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ. -The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, and MGF. -For other data types the text delimiting records can be specified manually using the generic splitter. +can be speciefied by a regular expression, e.g. ".*" for lines, ">.*" for FASTA, or "@.*" for FASTQ. +The tool has presets for text, tabular data sets (which are split by line), FASTA, FASTQ, and MGF. +For other data types the text delimiting records can be specified manually using the generic splitter. -If splitting by line (or by some other item, like a FASTA entry or an MGF record, the splitting can be either done alternating, in original record order, or at random. +If splitting by line (or by some other item, like a FASTA entry or an MGF record, the splitting can be either done alternating, in original record order, or at random. If t records are to be distributed to n new data sets, then the i-th record goes to data set -* floor(i / t * n) (for batch), +* floor(i / t * n) (for batch), * i % n (for alternating), or * a random data set @@ -368,11 +402,11 @@ 4 2 1 1 = === === ==== -Note that there are no guarantees when splitting at random that every result file will be non-empty, so downstream tools should be able to gracefully handle empty files. +Note that there are no guarantees when splitting at random that every result file will be non-empty, so downstream tools should be able to gracefully handle empty files. If a tabular file is used as input, you may choose to split by line or by column. If split by column, a new file is created for each unique value in the column. In addition, (Python) regular expressions may be used to transform the value in the column to a new value. Caution should be used with this feature, as it could transform all values to the same value, or other unexpected behavior. -The default regular expression uses each value in the column without modifying it. +The default regular expression uses each value in the column without modifying it. ]]></help> <citations> <citation type="bibtex">
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000000.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr1 1 0 247249719 chr1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000001.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr2 2 0 242951149 chr2
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000002.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr3 3 0 199501827 chr3
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000003.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr4 4 0 191273063 chr4
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000004.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr5 5 0 180857866 chr5
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000005.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr6 6 0 170899992 chr6
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000006.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr7 7 0 158821424 chr7
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000007.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr8 8 0 146274826 chr8
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000008.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr9 9 0 140273252 chr9
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000009.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr10 10 0 135374737 chr10
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000010.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr11 11 0 134452384 chr11
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000011.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr12 12 0 132349534 chr12
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000012.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr13 13 0 114142980 chr13
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000013.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr14 14 0 106368585 chr14
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000014.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr15 15 0 100338915 chr15
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000015.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr16 16 0 88827254 chr16
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000016.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr17 17 0 78774742 chr17
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000017.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr18 18 0 76117153 chr18
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000018.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr19 19 0 63811651 chr19
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000019.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr20 20 0 62435964 chr20
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000020.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr21 21 0 46944323 chr21
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000021.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chr22 22 0 49691432 chr22
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000022.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chrX x 0 154913754 chrx
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/chr_000023.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,1 @@ +chr - chrY y 0 57772954 chry
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/karyotype.txt Tue Sep 10 12:31:15 2019 -0400 @@ -0,0 +1,24 @@ +chr - chr1 1 0 247249719 chr1 +chr - chr2 2 0 242951149 chr2 +chr - chr3 3 0 199501827 chr3 +chr - chr4 4 0 191273063 chr4 +chr - chr5 5 0 180857866 chr5 +chr - chr6 6 0 170899992 chr6 +chr - chr7 7 0 158821424 chr7 +chr - chr8 8 0 146274826 chr8 +chr - chr9 9 0 140273252 chr9 +chr - chr10 10 0 135374737 chr10 +chr - chr11 11 0 134452384 chr11 +chr - chr12 12 0 132349534 chr12 +chr - chr13 13 0 114142980 chr13 +chr - chr14 14 0 106368585 chr14 +chr - chr15 15 0 100338915 chr15 +chr - chr16 16 0 88827254 chr16 +chr - chr17 17 0 78774742 chr17 +chr - chr18 18 0 76117153 chr18 +chr - chr19 19 0 63811651 chr19 +chr - chr20 20 0 62435964 chr20 +chr - chr21 21 0 46944323 chr21 +chr - chr22 22 0 49691432 chr22 +chr - chrX x 0 154913754 chrx +chr - chrY y 0 57772954 chry