Mercurial > repos > yating-l > rename_scaffolds
changeset 5:7c8b327f298c draft
planemo upload commit f565a59d3e28d34d1caf326fcee83d04a939c359
author | yating-l |
---|---|
date | Tue, 31 Jul 2018 13:50:21 -0400 |
parents | e35a3509c160 |
children | 2d143f0ac727 |
files | rename.py rename_scaffold.xml test-data/Dbia3_index.csv test-data/noascii_index.csv test-data/renamed_Dbia3_name_mapping.csv test-data/renamed_noascii_name_mapping.csv test-data/truncated_Dbia3_name_mapping.csv test-data/truncated_noascii_name_mapping.csv test-data/truncated_noascii_with_tab_name_mapping.csv |
diffstat | 9 files changed, 171 insertions(+), 95 deletions(-) [+] |
line wrap: on
line diff
--- a/rename.py Thu Jul 26 15:46:24 2018 -0400 +++ b/rename.py Tue Jul 31 13:50:21 2018 -0400 @@ -21,21 +21,22 @@ writer.writerow([oldname.encode('utf-8'), newname]) out.write(line) -def truncate(inputFile, outputFile, valid_characters): +def truncate(inputFile, outputFile, valid_characters, writer): names = [] - with codecs.open(outputFile, 'w', encoding='utf-8') as out: + with open(outputFile, 'w') as out: with codecs.open(inputFile, 'r', encoding='utf-8') as rf: lines = rf.readlines() for l in lines: if ">" in l: print l.encode('utf-8') - name = l[1:].rstrip() - name = substituteNonAscii(name, valid_characters) + oldname = l[1:].rstrip() + name = substituteNonAscii(oldname, valid_characters) if len(name) > 31: name = name[:31] print "\tTruncate the scaffold name to less than 31 characters: %s" % name if name in names: sys.exit("Name conflict! Name " + name + " already exist.") + writer.writerow([oldname.encode('utf-8'), name]) names.append(name) l = ">" + name + "\n" print "======================\n" @@ -54,14 +55,14 @@ inputfile = str(sys.argv[1]) manipulate = str(sys.argv[2]) outputfile = str(sys.argv[3]) - valid_characters = string.letters + string.punctuation + string.digits + ' ' + indexfile = str(sys.argv[4]) + csvfile = open(indexfile, 'w') + writer = csv.writer(csvfile) if manipulate == "rename": - indexfile = str(sys.argv[4]) - csvfile = open(indexfile, 'w') - writer = csv.writer(csvfile) rename(inputfile, outputfile, writer) elif manipulate == "truncate": - truncate(inputfile, outputfile, valid_characters) + valid_characters = string.letters + string.punctuation + string.digits + ' ' + truncate(inputfile, outputfile, valid_characters, writer) if __name__ == "__main__": main()
--- a/rename_scaffold.xml Thu Jul 26 15:46:24 2018 -0400 +++ b/rename_scaffold.xml Tue Jul 31 13:50:21 2018 -0400 @@ -1,14 +1,10 @@ -<tool id="rename_scaffold" name="rename the scaffolds" version="2.1"> +<tool id="rename_scaffold" name="rename the scaffolds" version="2.2"> <description>a Galaxy tool to rename or truncate the scaffold names in the reference genome so that they won't exceed 31 characters</description> <stdio> <exit_code range="1:" /> </stdio> <command><![CDATA[ - #if $manipulate_selector == "rename" - python $__tool_directory__/rename.py $input $manipulate_selector $output $index - #elif $manipulate_selector == "truncate" - python $__tool_directory__/rename.py $input $manipulate_selector $output - #end if + python $__tool_directory__/rename.py $input $manipulate_selector $output $index ]]></command> <inputs> <param name="input" type="data" format="fasta"/> @@ -19,9 +15,7 @@ </inputs> <outputs> <data name="output" format="fasta" label="${tool.name} on ${on_string}: fixed_reference" /> - <data name="index" format="csv" label="${tool.name} on ${on_string}: name mapping"> - <filter>manipulate_selector == "rename"</filter> - </data> + <data name="index" format="csv" label="${tool.name} on ${on_string}: name mapping" /> </outputs> <tests> <test> @@ -29,26 +23,28 @@ <param name="input" value="Dbia3.fa" /> <param name="manipulate_selector" value="rename" /> <output name="output" file="Dbia3_renamed.fa"/> - <output name="index" file="Dbia3_index.csv"/> + <output name="index" file="renamed_Dbia3_name_mapping.csv"/> </test> <test> <!-- Test truncate input Dbia3.fa --> <param name="input" value="Dbia3.fa" /> <param name="manipulate_selector" value="truncate" /> <output name="output" file="Dbia3.fa"/> + <output name="index" file="truncated_Dbia3_name_mapping.csv"/> </test> <test> <!-- Test rename input with non-ASCII charaters --> <param name="input" value="sequence_with_noascii.fa" /> <param name="manipulate_selector" value="rename" /> <output name="output" file="renamed_sequence_with_noascii.fa" /> - <output name="index" file="noascii_index.csv"/> + <output name="index" file="renamed_noascii_name_mapping.csv"/> </test> <test> <!-- Test truncate input with non-ASCII charaters --> <param name="input" value="sequence_with_noascii.fa" /> <param name="manipulate_selector" value="truncate" /> <output name="output" file="truncated_sequence_with_noascii.fa" /> + <output name="index" file="truncated_noascii_name_mapping.csv"/> </test> <test expect_failure="true"> <!-- Test truncate input with non-ASCII charaters. Expect fail: name conflict! --> @@ -60,12 +56,13 @@ <param name="input" value="sequence_with_tab.fa" /> <param name="manipulate_selector" value="truncate" /> <output name="output" file="fixed_reference_with_tab.fasta" /> + <output name="index" file="truncated_noascii_with_tab_name_mapping.csv"/> </test> </tests> <help><![CDATA[ This tool is to rename scaffolds in the reference genome so that the sequence names are less than 31 characters. Rename all scaffolds to scaffold_1, scaffold_2, ..., scaffold_N and also output a name mapping file. -Or truncate the scaffold names that are more than 31 characters and replace each invalid character (non-ASCII, '\\t', '\\n', '\\x0b', '\\x0c', '\\r') with '_' +Or truncate the scaffold names that are more than 31 characters and replace each invalid character (non-ASCII, '\\t', '\\n', '\\x0b', '\\x0c', '\\r') with '_' and also output a name mapping file. ]]></help> <citations> </citations>
--- a/test-data/Dbia3_index.csv Thu Jul 26 15:46:24 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,70 +0,0 @@ -contig1,scaffold_1 -contig2,scaffold_2 -contig3,scaffold_3 -contig4,scaffold_4 -contig5,scaffold_5 -contig6,scaffold_6 -contig7,scaffold_7 -contig8,scaffold_8 -contig9,scaffold_9 -contig10,scaffold_10 -contig11,scaffold_11 -contig12,scaffold_12 -contig13,scaffold_13 -contig14,scaffold_14 -contig15,scaffold_15 -contig16,scaffold_16 -contig17,scaffold_17 -contig18,scaffold_18 -contig19,scaffold_19 -contig20,scaffold_20 -contig21,scaffold_21 -contig22,scaffold_22 -contig23,scaffold_23 -contig24,scaffold_24 -contig25,scaffold_25 -contig26,scaffold_26 -contig27,scaffold_27 -contig28,scaffold_28 -contig29,scaffold_29 -contig30,scaffold_30 -contig31,scaffold_31 -contig32,scaffold_32 -contig33,scaffold_33 -contig34,scaffold_34 -contig35,scaffold_35 -contig36,scaffold_36 -contig37,scaffold_37 -contig38,scaffold_38 -contig39,scaffold_39 -contig40,scaffold_40 -contig41,scaffold_41 -contig42,scaffold_42 -contig43,scaffold_43 -contig44,scaffold_44 -contig45,scaffold_45 -contig46,scaffold_46 -contig47,scaffold_47 -contig48,scaffold_48 -contig49,scaffold_49 -contig50,scaffold_50 -contig51,scaffold_51 -contig52,scaffold_52 -contig53,scaffold_53 -contig54,scaffold_54 -contig55,scaffold_55 -contig56,scaffold_56 -contig57,scaffold_57 -contig58,scaffold_58 -contig59,scaffold_59 -contig60,scaffold_60 -contig61,scaffold_61 -contig62,scaffold_62 -contig63,scaffold_63 -contig64,scaffold_64 -contig65,scaffold_65 -contig66,scaffold_66 -contig67,scaffold_67 -contig68,scaffold_68 -contig69,scaffold_69 -contig70,scaffold_70
--- a/test-data/noascii_index.csv Thu Jul 26 15:46:24 2018 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,4 +0,0 @@ -contig1'abcdé')* _ {/&,scaffold_1 -"contig2 | 757â763.ii215âii225Stanke, M. and Waack",scaffold_2 -"contig3 | prediction on sequence number 1 (length = 1992969, name = scaffold1|size1992969) ----- # # Constraints/Hints: # (none) # Predicted genes for sequence number 1 on both strands # start gene g1 scaffold1|size1992969 AUGUSTUS gene 17453 19382 0.11 + . g6 scaffold1|size1992969 AUGUSTUS transcript 17453 19382 0.11 + . g6.t1 scaffold1|size1992969 AUGUSTUS start_codon 17453 17455",scaffold_3 -"contig4 | 757£763.ii215ôii225Stanke, M. and Waack",scaffold_4
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/renamed_Dbia3_name_mapping.csv Tue Jul 31 13:50:21 2018 -0400 @@ -0,0 +1,70 @@ +contig1,scaffold_1 +contig2,scaffold_2 +contig3,scaffold_3 +contig4,scaffold_4 +contig5,scaffold_5 +contig6,scaffold_6 +contig7,scaffold_7 +contig8,scaffold_8 +contig9,scaffold_9 +contig10,scaffold_10 +contig11,scaffold_11 +contig12,scaffold_12 +contig13,scaffold_13 +contig14,scaffold_14 +contig15,scaffold_15 +contig16,scaffold_16 +contig17,scaffold_17 +contig18,scaffold_18 +contig19,scaffold_19 +contig20,scaffold_20 +contig21,scaffold_21 +contig22,scaffold_22 +contig23,scaffold_23 +contig24,scaffold_24 +contig25,scaffold_25 +contig26,scaffold_26 +contig27,scaffold_27 +contig28,scaffold_28 +contig29,scaffold_29 +contig30,scaffold_30 +contig31,scaffold_31 +contig32,scaffold_32 +contig33,scaffold_33 +contig34,scaffold_34 +contig35,scaffold_35 +contig36,scaffold_36 +contig37,scaffold_37 +contig38,scaffold_38 +contig39,scaffold_39 +contig40,scaffold_40 +contig41,scaffold_41 +contig42,scaffold_42 +contig43,scaffold_43 +contig44,scaffold_44 +contig45,scaffold_45 +contig46,scaffold_46 +contig47,scaffold_47 +contig48,scaffold_48 +contig49,scaffold_49 +contig50,scaffold_50 +contig51,scaffold_51 +contig52,scaffold_52 +contig53,scaffold_53 +contig54,scaffold_54 +contig55,scaffold_55 +contig56,scaffold_56 +contig57,scaffold_57 +contig58,scaffold_58 +contig59,scaffold_59 +contig60,scaffold_60 +contig61,scaffold_61 +contig62,scaffold_62 +contig63,scaffold_63 +contig64,scaffold_64 +contig65,scaffold_65 +contig66,scaffold_66 +contig67,scaffold_67 +contig68,scaffold_68 +contig69,scaffold_69 +contig70,scaffold_70
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/renamed_noascii_name_mapping.csv Tue Jul 31 13:50:21 2018 -0400 @@ -0,0 +1,4 @@ +contig1'abcdé')* _ {/&,scaffold_1 +"contig2 | 757â763.ii215âii225Stanke, M. and Waack",scaffold_2 +"contig3 | prediction on sequence number 1 (length = 1992969, name = scaffold1|size1992969) ----- # # Constraints/Hints: # (none) # Predicted genes for sequence number 1 on both strands # start gene g1 scaffold1|size1992969 AUGUSTUS gene 17453 19382 0.11 + . g6 scaffold1|size1992969 AUGUSTUS transcript 17453 19382 0.11 + . g6.t1 scaffold1|size1992969 AUGUSTUS start_codon 17453 17455",scaffold_3 +"contig4 | 757£763.ii215ôii225Stanke, M. and Waack",scaffold_4
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/truncated_Dbia3_name_mapping.csv Tue Jul 31 13:50:21 2018 -0400 @@ -0,0 +1,70 @@ +contig1,contig1 +contig2,contig2 +contig3,contig3 +contig4,contig4 +contig5,contig5 +contig6,contig6 +contig7,contig7 +contig8,contig8 +contig9,contig9 +contig10,contig10 +contig11,contig11 +contig12,contig12 +contig13,contig13 +contig14,contig14 +contig15,contig15 +contig16,contig16 +contig17,contig17 +contig18,contig18 +contig19,contig19 +contig20,contig20 +contig21,contig21 +contig22,contig22 +contig23,contig23 +contig24,contig24 +contig25,contig25 +contig26,contig26 +contig27,contig27 +contig28,contig28 +contig29,contig29 +contig30,contig30 +contig31,contig31 +contig32,contig32 +contig33,contig33 +contig34,contig34 +contig35,contig35 +contig36,contig36 +contig37,contig37 +contig38,contig38 +contig39,contig39 +contig40,contig40 +contig41,contig41 +contig42,contig42 +contig43,contig43 +contig44,contig44 +contig45,contig45 +contig46,contig46 +contig47,contig47 +contig48,contig48 +contig49,contig49 +contig50,contig50 +contig51,contig51 +contig52,contig52 +contig53,contig53 +contig54,contig54 +contig55,contig55 +contig56,contig56 +contig57,contig57 +contig58,contig58 +contig59,contig59 +contig60,contig60 +contig61,contig61 +contig62,contig62 +contig63,contig63 +contig64,contig64 +contig65,contig65 +contig66,contig66 +contig67,contig67 +contig68,contig68 +contig69,contig69 +contig70,contig70
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/truncated_noascii_name_mapping.csv Tue Jul 31 13:50:21 2018 -0400 @@ -0,0 +1,4 @@ +contig1'abcdé')* _ {/&,contig1'abcd_')* _ {/& +"contig2 | 757â763.ii215âii225Stanke, M. and Waack",contig2 | 757_763.ii215_ii225St +"contig3 | prediction on sequence number 1 (length = 1992969, name = scaffold1|size1992969) ----- # # Constraints/Hints: # (none) # Predicted genes for sequence number 1 on both strands # start gene g1 scaffold1|size1992969 AUGUSTUS gene 17453 19382 0.11 + . g6 scaffold1|size1992969 AUGUSTUS transcript 17453 19382 0.11 + . g6.t1 scaffold1|size1992969 AUGUSTUS start_codon 17453 17455",contig3 | prediction on sequenc +"contig4 | 757£763.ii215ôii225Stanke, M. and Waack",contig4 | 757_763.ii215_ii225St
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/truncated_noascii_with_tab_name_mapping.csv Tue Jul 31 13:50:21 2018 -0400 @@ -0,0 +1,4 @@ +contig1'abcdé * _ {/&,contig1'abcd__* _ {/& +"contig2 | 757â763.ii215âii225Stanke, M. and Waack",contig2_| 757_763.ii215_ii225St +"contig3 | prediction on sequence number 1 (length = 1992969, name = scaffold1|size1992969) ----- # # Constraints/Hints: # (none) # Predicted genes for sequence number 1 on both strands # start gene g1 scaffold1|size1992969 AUGUSTUS gene 17453 19382 0.11 + . g6 scaffold1|size1992969 AUGUSTUS transcript 17453 19382 0.11 + . g6.t1 scaffold1|size1992969 AUGUSTUS start_codon 17453 17455",contig3 | prediction on sequenc +"contig4 | 757£763.ii215ôii225Stanke, M. and Waack",contig4 | 757_763.ii215_ii225St