Mercurial > repos > yating-l > rename_scaffolds

--- a/rename.py	Mon Jun 25 15:29:18 2018 -0400
+++ b/rename.py	Thu Jul 26 13:02:23 2018 -0400
@@ -1,12 +1,15 @@
 """
 Call rename to rename scaffolds in reference genome so that the sequence names are less than 31 characters. Rename all scaffolds to scaffold_1, scaffold_2, ..., scaffold_N and provide a name mapping file
+Call truncate to truncate the scaffold names that are more than 31 characters. Replace each non-ASCII character with '_'
 """
 import sys
 import csv
+import codecs
+import string

 def rename(inputfile, outputfile, writer):
     with open(outputfile, 'w') as out:
-        with open(inputfile, 'r') as rf:
+        with codecs.open(inputfile, 'r', encoding='utf-8') as rf:
             lines = rf.readlines()
             i = 1
             for line in lines:
@@ -15,16 +18,49 @@
                     newname = "scaffold_" + str(i)
                     line = ">" + newname + "\n"
                     i = i+1
-                    writer.writerow([oldname, newname])
+                    writer.writerow([oldname.encode('utf-8'), newname])
                 out.write(line)

+def truncate(inputFile, outputFile):
+    names = []
+    with codecs.open(outputFile, 'w', encoding='utf-8') as out:
+        with codecs.open(inputFile, 'r', encoding='utf-8') as rf:
+            lines = rf.readlines()
+            for l in lines:
+                if ">" in l:
+                    print l.encode('utf-8')
+                    name = l[1:].rstrip()
+                    name = substituteNonAscii(name)
+                    if len(name) > 31:
+                        name = name[:31]
+                        print "\tTruncate the scaffold name to less than 31 characters: %s" % name
+                    if name in names:
+                        sys.exit("Name conflict! Name " + name + " already exist.")
+                    names.append(name)
+                    l = ">" + name + "\n"
+                    print "======================\n"
+                out.write(l)
+
+def substituteNonAscii(str):
+    l = []
+    for c in str:
+        if c not in string.printable:
+            print "\tSubstitute Non-ASCII character %s with _" % c.encode('utf-8')
+            c = '_'
+        l.append(c)
+    return "".join(l)
+
 def main():
     inputfile = str(sys.argv[1])
-    outputfile = str(sys.argv[2])
-    indexfile = str(sys.argv[3])
-    csvfile = open(indexfile, 'w')
-    writer = csv.writer(csvfile)
-    rename(inputfile, outputfile, writer)
+    manipulate = str(sys.argv[2])
+    outputfile = str(sys.argv[3])
+    if manipulate == "rename":
+        indexfile = str(sys.argv[4])
+        csvfile = open(indexfile, 'w')
+        writer = csv.writer(csvfile)
+        rename(inputfile, outputfile, writer)
+    elif manipulate == "truncate":
+        truncate(inputfile, outputfile)

 if __name__ == "__main__":
     main()
Binary file rename.pyc has changed
--- a/rename_scaffold.xml	Mon Jun 25 15:29:18 2018 -0400
+++ b/rename_scaffold.xml	Thu Jul 26 13:02:23 2018 -0400
@@ -1,29 +1,65 @@
-<tool id="rename_scaffold" name="rename the scaffolds" version="1.2">
-    <description>a Galaxy tool to rename the scaffolds in the reference genome so that they won't exceed 31 characters</description>
+<tool id="rename_scaffold" name="rename the scaffolds" version="2.0">
+    <description>a Galaxy tool to rename or truncate the scaffold names in the reference genome so that they won't exceed 31 characters</description>
 <stdio>
     <exit_code range="1:" />
 </stdio>
 <command><![CDATA[
-    python $__tool_directory__/rename.py $input $output $index
+    #if $manipulate_selector == "rename"
+      python $__tool_directory__/rename.py $input $manipulate_selector $output $index
+    #elif $manipulate_selector == "truncate"
+      python $__tool_directory__/rename.py $input $manipulate_selector $output
+    #end if
 ]]></command>
 <inputs>
     <param name="input" type="data" format="fasta"/>
+    <param name="manipulate_selector" type="select" label="Choose whether you want to rename the scaffolds or truncate the scaffold names">
+        <option value="truncate" selected="true">Truncate the scaffold names if they exceed 31 characters</option>
+        <option value="rename">Rename the scaffold names</option>
+    </param>
 </inputs>
 <outputs>
-    <data name="output" format="fasta" label="${tool.name} on ${on_string}: renamed_reference" />
-    <data name="index" format="csv" label="${tool.name} on ${on_string}: name mapping"/>
+    <data name="output" format="fasta" label="${tool.name} on ${on_string}: fixed_reference" />
+    <data name="index" format="csv" label="${tool.name} on ${on_string}: name mapping">
+        <filter>manipulate_selector == "rename"</filter>
+    </data>
 </outputs>
 <tests>
     <test>
-        <!-- Test with Dbia3.fa -->
+        <!-- Test rename input Dbia3.fa -->
         <param name="input" value="Dbia3.fa" />
+        <param name="manipulate_selector" value="rename" />
         <output name="output" file="Dbia3_renamed.fa"/>
         <output name="index" file="Dbia3_index.csv"/>
     </test>
+    <test>
+        <!-- Test truncate input Dbia3.fa -->
+        <param name="input" value="Dbia3.fa" />
+        <param name="manipulate_selector" value="truncate" />
+        <output name="output" file="Dbia3.fa"/>
+    </test>
+    <test>
+        <!-- Test rename input with non-ASCII characters -->
+        <param name="input" value="sequence_with_noascii.fa" />
+        <param name="manipulate_selector" value="rename" />
+        <output name="output" file="renamed_sequence_with_noascii.fa" />
+        <output name="index" file="noascii_index.csv"/>
+    </test>
+    <test>
+        <!-- Test truncate input with non-ASCII characters -->
+        <param name="input" value="sequence_with_noascii.fa" />
+        <param name="manipulate_selector" value="truncate" />
+        <output name="output" file="truncated_sequence_with_noascii.fa" />
+    </test>
+    <test expect_failure="true">
+        <!-- Test truncate input with non-ASCII characters. Expect failure: name conflict! -->
+        <param name="input" value="sequence_with_noascii_name_conflict.fa" />
+        <param name="manipulate_selector" value="truncate" />
+    </test>
 </tests>
 <help><![CDATA[
-This tool is to rename scaffolds in the reference genome so that the sequence names are less than 31 characters. Rename all scaffolds to scaffold_1, scaffold_2, ..., scaffold_N and also output a name mapping file
+This tool is to rename scaffolds in the reference genome so that the sequence names are less than 31 characters. Rename all scaffolds to scaffold_1, scaffold_2, ..., scaffold_N and also output a name mapping file.

+Alternatively, truncate the scaffold names that are more than 31 characters and replace each non-ASCII character with '_'.
 ]]></help>
 <citations>
 </citations>
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/noascii_index.csv	Thu Jul 26 13:02:23 2018 -0400
@@ -0,0 +1,4 @@
+contig1'abcdé')* _ {/&,scaffold_1
+"contig2 | 757â763.ii215âii225Stanke, M. and Waack",scaffold_2
+"contig3 | prediction on sequence number 1 (length = 1992969, name = scaffold1|size1992969) ----- # # Constraints/Hints: # (none) # Predicted genes for sequence number 1 on both strands # start gene g1 scaffold1|size1992969 AUGUSTUS gene 17453 19382 0.11 + . g6 scaffold1|size1992969 AUGUSTUS transcript 17453 19382 0.11 + . g6.t1 scaffold1|size1992969 AUGUSTUS start_codon 17453 17455",scaffold_3
+"contig4 | 757£763.ii215ôii225Stanke, M. and Waack",scaffold_4
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/renamed_sequence_with_noascii.fa	Thu Jul 26 13:02:23 2018 -0400
@@ -0,0 +1,12 @@
+>scaffold_1
+AAAACTAATTTTATCAAAATCGGACAACTATATCATATAGCTGCCATACG
+AACGATCGGAAAATTGGTAAGTAAATAATTAAAAATATTATATCTTTGGT
+>scaffold_2
+AAAACTAATTTTATCAAAATCGGACAACTATATCATATAGCTGCCATACG
+AACGATCGGAAAATTGGTAAGTAAATAATTAAAAATATTATATCTTTGGT
+>scaffold_3
+AAAACTAATTTTATCAAAATCGGACAACTATATCATATAGCTGCCATACG
+AACGATCGGAAAATTGGTAAGTAAATAATTAAAAATATTATATCTTTGGT
+>scaffold_4
+AAAACTAATTTTATCAAAATCGGACAACTATATCATATAGCTGCCATACG
+AACGATCGGAAAATTGGTAAGTAAATAATTAAAAATATTATATCTTTGGT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sequence_with_noascii.fa	Thu Jul 26 13:02:23 2018 -0400
@@ -0,0 +1,12 @@
+>contig1'abcdé')* _ {/&
+AAAACTAATTTTATCAAAATCGGACAACTATATCATATAGCTGCCATACG
+AACGATCGGAAAATTGGTAAGTAAATAATTAAAAATATTATATCTTTGGT
+>contig2 | 757â763.ii215âii225Stanke, M. and Waack
+AAAACTAATTTTATCAAAATCGGACAACTATATCATATAGCTGCCATACG
+AACGATCGGAAAATTGGTAAGTAAATAATTAAAAATATTATATCTTTGGT
+>contig3 | prediction on sequence number 1 (length = 1992969, name = scaffold1|size1992969) ----- # # Constraints/Hints: # (none) # Predicted genes for sequence number 1 on both strands # start gene g1 scaffold1|size1992969 AUGUSTUS gene 17453 19382 0.11 + . g6 scaffold1|size1992969 AUGUSTUS transcript 17453 19382 0.11 + . g6.t1 scaffold1|size1992969 AUGUSTUS start_codon 17453 17455
+AAAACTAATTTTATCAAAATCGGACAACTATATCATATAGCTGCCATACG
+AACGATCGGAAAATTGGTAAGTAAATAATTAAAAATATTATATCTTTGGT
+>contig4 | 757£763.ii215ôii225Stanke, M. and Waack
+AAAACTAATTTTATCAAAATCGGACAACTATATCATATAGCTGCCATACG
+AACGATCGGAAAATTGGTAAGTAAATAATTAAAAATATTATATCTTTGGT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/sequence_with_noascii_name_conflict.fa	Thu Jul 26 13:02:23 2018 -0400
@@ -0,0 +1,12 @@
+>contig'abcdé')* _ {/&
+AAAACTAATTTTATCAAAATCGGACAACTATATCATATAGCTGCCATACG
+AACGATCGGAAAATTGGTAAGTAAATAATTAAAAATATTATATCTTTGGT
+>contig | 757â763.ii215âii225Stanke, M. and Waack
+AAAACTAATTTTATCAAAATCGGACAACTATATCATATAGCTGCCATACG
+AACGATCGGAAAATTGGTAAGTAAATAATTAAAAATATTATATCTTTGGT
+>contig | prediction on sequence number 1 (length = 1992969, name = scaffold1|size1992969) ----- # # Constraints/Hints: # (none) # Predicted genes for sequence number 1 on both strands # start gene g1 scaffold1|size1992969 AUGUSTUS gene 17453 19382 0.11 + . g6 scaffold1|size1992969 AUGUSTUS transcript 17453 19382 0.11 + . g6.t1 scaffold1|size1992969 AUGUSTUS start_codon 17453 17455
+AAAACTAATTTTATCAAAATCGGACAACTATATCATATAGCTGCCATACG
+AACGATCGGAAAATTGGTAAGTAAATAATTAAAAATATTATATCTTTGGT
+>contig | 757£763.ii215ôii225Stanke, M. and Waack
+AAAACTAATTTTATCAAAATCGGACAACTATATCATATAGCTGCCATACG
+AACGATCGGAAAATTGGTAAGTAAATAATTAAAAATATTATATCTTTGGT
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/truncated_sequence_with_noascii.fa	Thu Jul 26 13:02:23 2018 -0400
@@ -0,0 +1,12 @@
+>contig1'abcd_')* _ {/&
+AAAACTAATTTTATCAAAATCGGACAACTATATCATATAGCTGCCATACG
+AACGATCGGAAAATTGGTAAGTAAATAATTAAAAATATTATATCTTTGGT
+>contig2 | 757_763.ii215_ii225St
+AAAACTAATTTTATCAAAATCGGACAACTATATCATATAGCTGCCATACG
+AACGATCGGAAAATTGGTAAGTAAATAATTAAAAATATTATATCTTTGGT
+>contig3 | prediction on sequenc
+AAAACTAATTTTATCAAAATCGGACAACTATATCATATAGCTGCCATACG
+AACGATCGGAAAATTGGTAAGTAAATAATTAAAAATATTATATCTTTGGT
+>contig4 | 757_763.ii215_ii225St
+AAAACTAATTTTATCAAAATCGGACAACTATATCATATAGCTGCCATACG
+AACGATCGGAAAATTGGTAAGTAAATAATTAAAAATATTATATCTTTGGT