Repository 'rename_scaffolds'
hg clone https://toolshed.g2.bx.psu.edu/repos/yating-l/rename_scaffolds

Changeset 5:7c8b327f298c (2018-07-31)
Previous changeset 4:e35a3509c160 (2018-07-26) Next changeset 6:2d143f0ac727 (2018-07-31)
Commit message:
planemo upload commit f565a59d3e28d34d1caf326fcee83d04a939c359
modified:
rename.py
rename_scaffold.xml
added:
test-data/renamed_Dbia3_name_mapping.csv
test-data/renamed_noascii_name_mapping.csv
test-data/truncated_Dbia3_name_mapping.csv
test-data/truncated_noascii_name_mapping.csv
test-data/truncated_noascii_with_tab_name_mapping.csv
removed:
test-data/Dbia3_index.csv
test-data/noascii_index.csv
b
diff -r e35a3509c160 -r 7c8b327f298c rename.py
--- a/rename.py Thu Jul 26 15:46:24 2018 -0400
+++ b/rename.py Tue Jul 31 13:50:21 2018 -0400
[
@@ -21,21 +21,22 @@
                     writer.writerow([oldname.encode('utf-8'), newname])
                 out.write(line)
 
-def truncate(inputFile, outputFile, valid_characters):
+def truncate(inputFile, outputFile, valid_characters, writer):
     names = []
-    with codecs.open(outputFile, 'w', encoding='utf-8') as out:
+    with open(outputFile, 'w') as out:
         with codecs.open(inputFile, 'r', encoding='utf-8') as rf:
             lines = rf.readlines()
             for l in lines:
                 if ">" in l:
                     print l.encode('utf-8')
-                    name = l[1:].rstrip()
-                    name = substituteNonAscii(name, valid_characters)
+                    oldname = l[1:].rstrip()
+                    name = substituteNonAscii(oldname, valid_characters)
                     if len(name) > 31:
                         name = name[:31]
                         print "\tTruncate the scaffold name to less than 31 characters: %s" % name
                     if name in names:
                         sys.exit("Name conflict! Name " + name + " already exist.")
+                    writer.writerow([oldname.encode('utf-8'), name])
                     names.append(name)
                     l = ">" + name + "\n"
                     print "======================\n"
@@ -54,14 +55,14 @@
     inputfile = str(sys.argv[1])
     manipulate = str(sys.argv[2])
     outputfile = str(sys.argv[3])
-    valid_characters = string.letters + string.punctuation + string.digits + ' '
+    indexfile = str(sys.argv[4])
+    csvfile = open(indexfile, 'w')
+    writer = csv.writer(csvfile)
     if manipulate == "rename":
-        indexfile = str(sys.argv[4])
-        csvfile = open(indexfile, 'w')
-        writer = csv.writer(csvfile)
         rename(inputfile, outputfile, writer)
     elif manipulate == "truncate":
-        truncate(inputfile, outputfile, valid_characters)
+        valid_characters = string.letters + string.punctuation + string.digits + ' '
+        truncate(inputfile, outputfile, valid_characters, writer)
 
 if __name__ == "__main__":
     main()
b
diff -r e35a3509c160 -r 7c8b327f298c rename_scaffold.xml
--- a/rename_scaffold.xml Thu Jul 26 15:46:24 2018 -0400
+++ b/rename_scaffold.xml Tue Jul 31 13:50:21 2018 -0400
[
@@ -1,14 +1,10 @@
-<tool id="rename_scaffold" name="rename the scaffolds" version="2.1">
+<tool id="rename_scaffold" name="rename the scaffolds" version="2.2">
     <description>a Galaxy tool to rename or truncate the scaffold names in the reference genome so that they won't exceed 31 characters</description>
 <stdio>
     <exit_code range="1:" />
 </stdio>
 <command><![CDATA[
-    #if $manipulate_selector == "rename"
-      python $__tool_directory__/rename.py $input $manipulate_selector $output $index
-    #elif $manipulate_selector == "truncate"
-      python $__tool_directory__/rename.py $input $manipulate_selector $output
-    #end if
+    python $__tool_directory__/rename.py $input $manipulate_selector $output $index
 ]]></command>
 <inputs>
     <param name="input" type="data" format="fasta"/>
@@ -19,9 +15,7 @@
 </inputs>
 <outputs>
     <data name="output" format="fasta" label="${tool.name} on ${on_string}: fixed_reference" />
-    <data name="index" format="csv" label="${tool.name} on ${on_string}: name mapping">
-        <filter>manipulate_selector == "rename"</filter>
-    </data>
+    <data name="index" format="csv" label="${tool.name} on ${on_string}: name mapping" />
 </outputs>
 <tests>
     <test>
@@ -29,26 +23,28 @@
         <param name="input" value="Dbia3.fa" />
         <param name="manipulate_selector" value="rename" />
         <output name="output" file="Dbia3_renamed.fa"/>
-        <output name="index" file="Dbia3_index.csv"/>
+        <output name="index" file="renamed_Dbia3_name_mapping.csv"/>
     </test>
     <test>
         <!-- Test truncate input Dbia3.fa -->
         <param name="input" value="Dbia3.fa" />
         <param name="manipulate_selector" value="truncate" />
         <output name="output" file="Dbia3.fa"/>
+        <output name="index" file="truncated_Dbia3_name_mapping.csv"/>
     </test>
     <test>
         <!-- Test rename input with non-ASCII characters -->
         <param name="input" value="sequence_with_noascii.fa" />
         <param name="manipulate_selector" value="rename" />
         <output name="output" file="renamed_sequence_with_noascii.fa" />
-        <output name="index" file="noascii_index.csv"/>
+        <output name="index" file="renamed_noascii_name_mapping.csv"/>
     </test>
     <test>
         <!-- Test truncate input with non-ASCII characters -->
         <param name="input" value="sequence_with_noascii.fa" />
         <param name="manipulate_selector" value="truncate" />
         <output name="output" file="truncated_sequence_with_noascii.fa" />
+        <output name="index" file="truncated_noascii_name_mapping.csv"/>
     </test>
     <test expect_failure="true">
         <!-- Test truncate input with non-ASCII characters. Expect fail: name conflict! -->
@@ -60,12 +56,13 @@
         <param name="input" value="sequence_with_tab.fa" />
         <param name="manipulate_selector" value="truncate" />
         <output name="output" file="fixed_reference_with_tab.fasta" />
+        <output name="index" file="truncated_noascii_with_tab_name_mapping.csv"/>
     </test>
 </tests>
 <help><![CDATA[
 This tool renames scaffolds in the reference genome so that the sequence names are at most 31 characters long. Rename all scaffolds to scaffold_1, scaffold_2, ..., scaffold_N and also output a name mapping file.
 
-Or truncate the scaffold names that are more than 31 characters and replace each invalid character (non-ASCII, '\\t', '\\n', '\\x0b', '\\x0c', '\\r') with '_'
+Or truncate the scaffold names that are more than 31 characters and replace each invalid character (non-ASCII, '\\t', '\\n', '\\x0b', '\\x0c', '\\r') with '_' and also output a name mapping file.
 ]]></help>
 <citations>
 </citations>
b
diff -r e35a3509c160 -r 7c8b327f298c test-data/Dbia3_index.csv
--- a/test-data/Dbia3_index.csv Thu Jul 26 15:46:24 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,70 +0,0 @@
-contig1,scaffold_1
-contig2,scaffold_2
-contig3,scaffold_3
-contig4,scaffold_4
-contig5,scaffold_5
-contig6,scaffold_6
-contig7,scaffold_7
-contig8,scaffold_8
-contig9,scaffold_9
-contig10,scaffold_10
-contig11,scaffold_11
-contig12,scaffold_12
-contig13,scaffold_13
-contig14,scaffold_14
-contig15,scaffold_15
-contig16,scaffold_16
-contig17,scaffold_17
-contig18,scaffold_18
-contig19,scaffold_19
-contig20,scaffold_20
-contig21,scaffold_21
-contig22,scaffold_22
-contig23,scaffold_23
-contig24,scaffold_24
-contig25,scaffold_25
-contig26,scaffold_26
-contig27,scaffold_27
-contig28,scaffold_28
-contig29,scaffold_29
-contig30,scaffold_30
-contig31,scaffold_31
-contig32,scaffold_32
-contig33,scaffold_33
-contig34,scaffold_34
-contig35,scaffold_35
-contig36,scaffold_36
-contig37,scaffold_37
-contig38,scaffold_38
-contig39,scaffold_39
-contig40,scaffold_40
-contig41,scaffold_41
-contig42,scaffold_42
-contig43,scaffold_43
-contig44,scaffold_44
-contig45,scaffold_45
-contig46,scaffold_46
-contig47,scaffold_47
-contig48,scaffold_48
-contig49,scaffold_49
-contig50,scaffold_50
-contig51,scaffold_51
-contig52,scaffold_52
-contig53,scaffold_53
-contig54,scaffold_54
-contig55,scaffold_55
-contig56,scaffold_56
-contig57,scaffold_57
-contig58,scaffold_58
-contig59,scaffold_59
-contig60,scaffold_60
-contig61,scaffold_61
-contig62,scaffold_62
-contig63,scaffold_63
-contig64,scaffold_64
-contig65,scaffold_65
-contig66,scaffold_66
-contig67,scaffold_67
-contig68,scaffold_68
-contig69,scaffold_69
-contig70,scaffold_70
b
diff -r e35a3509c160 -r 7c8b327f298c test-data/noascii_index.csv
--- a/test-data/noascii_index.csv Thu Jul 26 15:46:24 2018 -0400
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
b
@@ -1,4 +0,0 @@
-contig1'abcdé')* _ {/&,scaffold_1
-"contig2 | 757â763.ii215âii225Stanke, M. and Waack",scaffold_2
-"contig3 | prediction on sequence number 1 (length = 1992969, name = scaffold1|size1992969) ----- # # Constraints/Hints: # (none) # Predicted genes for sequence number 1 on both strands # start gene g1 scaffold1|size1992969 AUGUSTUS gene 17453 19382 0.11 + . g6 scaffold1|size1992969 AUGUSTUS transcript 17453 19382 0.11 + . g6.t1 scaffold1|size1992969 AUGUSTUS start_codon 17453 17455",scaffold_3
-"contig4 | 757£763.ii215ôii225Stanke, M. and Waack",scaffold_4
b
diff -r e35a3509c160 -r 7c8b327f298c test-data/renamed_Dbia3_name_mapping.csv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/renamed_Dbia3_name_mapping.csv Tue Jul 31 13:50:21 2018 -0400
b
@@ -0,0 +1,70 @@
+contig1,scaffold_1
+contig2,scaffold_2
+contig3,scaffold_3
+contig4,scaffold_4
+contig5,scaffold_5
+contig6,scaffold_6
+contig7,scaffold_7
+contig8,scaffold_8
+contig9,scaffold_9
+contig10,scaffold_10
+contig11,scaffold_11
+contig12,scaffold_12
+contig13,scaffold_13
+contig14,scaffold_14
+contig15,scaffold_15
+contig16,scaffold_16
+contig17,scaffold_17
+contig18,scaffold_18
+contig19,scaffold_19
+contig20,scaffold_20
+contig21,scaffold_21
+contig22,scaffold_22
+contig23,scaffold_23
+contig24,scaffold_24
+contig25,scaffold_25
+contig26,scaffold_26
+contig27,scaffold_27
+contig28,scaffold_28
+contig29,scaffold_29
+contig30,scaffold_30
+contig31,scaffold_31
+contig32,scaffold_32
+contig33,scaffold_33
+contig34,scaffold_34
+contig35,scaffold_35
+contig36,scaffold_36
+contig37,scaffold_37
+contig38,scaffold_38
+contig39,scaffold_39
+contig40,scaffold_40
+contig41,scaffold_41
+contig42,scaffold_42
+contig43,scaffold_43
+contig44,scaffold_44
+contig45,scaffold_45
+contig46,scaffold_46
+contig47,scaffold_47
+contig48,scaffold_48
+contig49,scaffold_49
+contig50,scaffold_50
+contig51,scaffold_51
+contig52,scaffold_52
+contig53,scaffold_53
+contig54,scaffold_54
+contig55,scaffold_55
+contig56,scaffold_56
+contig57,scaffold_57
+contig58,scaffold_58
+contig59,scaffold_59
+contig60,scaffold_60
+contig61,scaffold_61
+contig62,scaffold_62
+contig63,scaffold_63
+contig64,scaffold_64
+contig65,scaffold_65
+contig66,scaffold_66
+contig67,scaffold_67
+contig68,scaffold_68
+contig69,scaffold_69
+contig70,scaffold_70
b
diff -r e35a3509c160 -r 7c8b327f298c test-data/renamed_noascii_name_mapping.csv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/renamed_noascii_name_mapping.csv Tue Jul 31 13:50:21 2018 -0400
b
@@ -0,0 +1,4 @@
+contig1'abcdé')* _ {/&,scaffold_1
+"contig2 | 757â763.ii215âii225Stanke, M. and Waack",scaffold_2
+"contig3 | prediction on sequence number 1 (length = 1992969, name = scaffold1|size1992969) ----- # # Constraints/Hints: # (none) # Predicted genes for sequence number 1 on both strands # start gene g1 scaffold1|size1992969 AUGUSTUS gene 17453 19382 0.11 + . g6 scaffold1|size1992969 AUGUSTUS transcript 17453 19382 0.11 + . g6.t1 scaffold1|size1992969 AUGUSTUS start_codon 17453 17455",scaffold_3
+"contig4 | 757£763.ii215ôii225Stanke, M. and Waack",scaffold_4
b
diff -r e35a3509c160 -r 7c8b327f298c test-data/truncated_Dbia3_name_mapping.csv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/truncated_Dbia3_name_mapping.csv Tue Jul 31 13:50:21 2018 -0400
b
@@ -0,0 +1,70 @@
+contig1,contig1
+contig2,contig2
+contig3,contig3
+contig4,contig4
+contig5,contig5
+contig6,contig6
+contig7,contig7
+contig8,contig8
+contig9,contig9
+contig10,contig10
+contig11,contig11
+contig12,contig12
+contig13,contig13
+contig14,contig14
+contig15,contig15
+contig16,contig16
+contig17,contig17
+contig18,contig18
+contig19,contig19
+contig20,contig20
+contig21,contig21
+contig22,contig22
+contig23,contig23
+contig24,contig24
+contig25,contig25
+contig26,contig26
+contig27,contig27
+contig28,contig28
+contig29,contig29
+contig30,contig30
+contig31,contig31
+contig32,contig32
+contig33,contig33
+contig34,contig34
+contig35,contig35
+contig36,contig36
+contig37,contig37
+contig38,contig38
+contig39,contig39
+contig40,contig40
+contig41,contig41
+contig42,contig42
+contig43,contig43
+contig44,contig44
+contig45,contig45
+contig46,contig46
+contig47,contig47
+contig48,contig48
+contig49,contig49
+contig50,contig50
+contig51,contig51
+contig52,contig52
+contig53,contig53
+contig54,contig54
+contig55,contig55
+contig56,contig56
+contig57,contig57
+contig58,contig58
+contig59,contig59
+contig60,contig60
+contig61,contig61
+contig62,contig62
+contig63,contig63
+contig64,contig64
+contig65,contig65
+contig66,contig66
+contig67,contig67
+contig68,contig68
+contig69,contig69
+contig70,contig70
b
diff -r e35a3509c160 -r 7c8b327f298c test-data/truncated_noascii_name_mapping.csv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/truncated_noascii_name_mapping.csv Tue Jul 31 13:50:21 2018 -0400
b
@@ -0,0 +1,4 @@
+contig1'abcdé')* _ {/&,contig1'abcd_')* _ {/&
+"contig2 | 757â763.ii215âii225Stanke, M. and Waack",contig2 | 757_763.ii215_ii225St
+"contig3 | prediction on sequence number 1 (length = 1992969, name = scaffold1|size1992969) ----- # # Constraints/Hints: # (none) # Predicted genes for sequence number 1 on both strands # start gene g1 scaffold1|size1992969 AUGUSTUS gene 17453 19382 0.11 + . g6 scaffold1|size1992969 AUGUSTUS transcript 17453 19382 0.11 + . g6.t1 scaffold1|size1992969 AUGUSTUS start_codon 17453 17455",contig3 | prediction on sequenc
+"contig4 | 757£763.ii215ôii225Stanke, M. and Waack",contig4 | 757_763.ii215_ii225St
b
diff -r e35a3509c160 -r 7c8b327f298c test-data/truncated_noascii_with_tab_name_mapping.csv
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/truncated_noascii_with_tab_name_mapping.csv Tue Jul 31 13:50:21 2018 -0400
b
@@ -0,0 +1,4 @@
+contig1'abcdé * _ {/&,contig1'abcd__* _ {/&
+"contig2 | 757â763.ii215âii225Stanke, M. and Waack",contig2_| 757_763.ii215_ii225St
+"contig3 | prediction on sequence number 1 (length = 1992969, name = scaffold1|size1992969) ----- # # Constraints/Hints: # (none) # Predicted genes for sequence number 1 on both strands # start gene g1 scaffold1|size1992969 AUGUSTUS gene 17453 19382 0.11 + . g6 scaffold1|size1992969 AUGUSTUS transcript 17453 19382 0.11 + . g6.t1 scaffold1|size1992969 AUGUSTUS start_codon 17453 17455",contig3 | prediction on sequenc
+"contig4 | 757£763.ii215ôii225Stanke, M. and Waack",contig4 | 757_763.ii215_ii225St