Mercurial > repos > artbio > mircounts

--- a/mature_mir_gff_translation.py	Tue Jul 25 11:04:28 2017 -0400
+++ b/mature_mir_gff_translation.py	Wed Jul 26 19:15:08 2017 -0400
@@ -1,6 +1,5 @@
 #!/usr/bin/env python

-import sys
 import argparse


@@ -9,24 +8,27 @@
     the_parser.add_argument(
         '--input', action="store", type=str, help="input miRBase GFF3 file")
     the_parser.add_argument(
-        '--output', action="store", type=str, help="output GFF3 file with converted mature mir coordinates")
+        '--output', action="store", type=str,
+        help="output GFF3 file with converted mature mir coordinates")
     args = the_parser.parse_args()
     return args

-GFF3_header= '''##gff-version 3
+
+GFF3_header = '''##gff-version 3
 ##generated by mature_mir_gff_translation.py
 #
 # Chromosomal coordinates of microRNAs ** relative to the hairpin precursors **
 # microRNAs:               miRBase current_version
 # genome-build-id:         check http://mirbase.org/
 #
-# Hairpin precursor sequences have type "miRNA_primary_transcript".
-# Note, these sequences do not represent the full primary transcript,
-# rather a predicted stem-loop portion that includes the precursor
+# Hairpin precursor sequences have type "miRNA_primary_transcript".
+# Note, these sequences do not represent the full primary transcript,
+# rather a predicted stem-loop portion that includes the precursor
 # miRNA. Mature sequences have type "miRNA".
 #
 '''

+
 def load_gff_in_dict(gff_input_file):
     '''
     Reads the gff3 file and return a dictionary of dictionaries
@@ -35,26 +37,29 @@
     '''
     gff_dict = {}
     for line in open(gff_input_file, "r"):
-        if line[0]=="#":
+        if line[0] == "#":
             continue
-        gff_fields=line[:-1].split("\t")
-        ID=gff_fields[8].split("ID=")[1].split(";")[0]
+        gff_fields = line[:-1].split("\t")
+        ID = gff_fields[8].split("ID=")[1].split(";")[0]
         gff_dict[ID] = {}
-        gff_dict[ID]["seqid"]=gff_fields[0]
-        gff_dict[ID]["source"]=gff_fields[1]
-        gff_dict[ID]["type"]=gff_fields[2]
-        gff_dict[ID]["start"]=gff_fields[3]
-        gff_dict[ID]["end"]=gff_fields[4]
-        gff_dict[ID]["score"]=gff_fields[5]
-        gff_dict[ID]["strand"]=gff_fields[6]
-        gff_dict[ID]["phase"]=gff_fields[7]
-        gff_dict[ID]["attributes"]=gff_fields[8]
+        gff_dict[ID]["seqid"] = gff_fields[0]
+        gff_dict[ID]["source"] = gff_fields[1]
+        gff_dict[ID]["type"] = gff_fields[2]
+        gff_dict[ID]["start"] = gff_fields[3]
+        gff_dict[ID]["end"] = gff_fields[4]
+        gff_dict[ID]["score"] = gff_fields[5]
+        gff_dict[ID]["strand"] = gff_fields[6]
+        gff_dict[ID]["phase"] = gff_fields[7]
+        gff_dict[ID]["attributes"] = gff_fields[8]
         if "Derives_from" in gff_dict[ID]["attributes"]:
-            parent_primary_transcript=gff_dict[ID]["attributes"].split("Derives_from=")[1]
-            parent_primary_transcript=gff_dict[parent_primary_transcript]["attributes"].split("Name=")[1]
-            gff_dict[ID]["attributes"]="%s;Parent_mir_Name=%s" % (gff_dict[ID]["attributes"], parent_primary_transcript)
+            parent_primary_transcript = gff_dict[ID]["attributes"].split(
+                "Derives_from=")[1]
+            parent_primary_transcript = gff_dict[parent_primary_transcript][
+                                            "attributes"].split("Name=")[1]
+            gff_dict[ID]["attributes"] = "%s;Parent_mir_Name=%s" % (
+                gff_dict[ID]["attributes"], parent_primary_transcript)
     return gff_dict
-
+

 def genome_to_mir_gff(gff_dict, output):
     '''
@@ -63,36 +68,50 @@
     Note that GFF files are 1-based coordinates
     '''
     for key in gff_dict:
-        name=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0]
-        gff_dict[key]["seqid"]=name
+        name = gff_dict[key]["attributes"].split("Name=")[1].split(";")[0]
+        gff_dict[key]["seqid"] = name
         if "Derives_from=" in gff_dict[key]["attributes"]:
-            parent_ID=gff_dict[key]["attributes"].split("Derives_from=")[1].split(";")[0]
-            gff_dict[key]["start"]=str(int(gff_dict[key]["start"]) - int(gff_dict[parent_ID]["start"]) + 1)
-            gff_dict[key]["end"]=str(int(gff_dict[key]["end"]) - int(gff_dict[parent_ID]["start"]) + 1)
-    hairpins={}
-    matures={}
-    for key in gff_dict:  ## treats miRNA_primary_transcript coordinates in a second loop to avoid errors in conversion
-        if gff_dict[key]["type"]=="miRNA_primary_transcript":
-            gff_dict[key]["end"]=str(int(gff_dict[key]["end"]) - int(gff_dict[key]["start"]) + 1)
-            gff_dict[key]["start"]="1"
+            parent_ID = gff_dict[key]["attributes"].split(
+                                        "Derives_from=")[1].split(";")[0]
+            gff_dict[key]["start"] = str(int(gff_dict[key]["start"])-int(
+                                       gff_dict[parent_ID]["start"])+1)
+            gff_dict[key]["end"] = str(int(gff_dict[key]["end"])-int(
+                                     gff_dict[parent_ID]["start"])+1)
+    hairpins = {}
+    matures = {}
+    # treats miRNA_primary_transcript coordinates
+    # in a second loop to avoid errors in conversion
+    for key in gff_dict:
+        if gff_dict[key]["type"] == "miRNA_primary_transcript":
+            gff_dict[key]["end"] = str(int(gff_dict[key]["end"])-int(
+                                       gff_dict[key]["start"]) + 1)
+            gff_dict[key]["start"] = '1'
             # now, do a dict[ID]=Name but only for miRNA_primary_transcript
-            hairpins[key]=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0]
+            hairpins[key] = gff_dict[key]["attributes"].split(
+                                                        "Name=")[1].split(
+                                                        ";")[0]
         else:
-            matures[key]=gff_dict[key]["attributes"].split("Name=")[1].split(";")[0]
+            matures[key] = gff_dict[key]["attributes"].split(
+                                                        "Name=")[1].split(
+                                                         ";")[0]
     with open(output, "w") as output:
         output.write(GFF3_header)
         for ID in sorted(hairpins, key=hairpins.get):
-            output.write("\t".join([gff_dict[ID]["seqid"], gff_dict[ID]["source"],
-                gff_dict[ID]["type"], gff_dict[ID]["start"], gff_dict[ID]["end"],
-                gff_dict[ID]["score"], gff_dict[ID]["strand"], gff_dict[ID]["phase"],
-                gff_dict[ID]["attributes"]]))
+            output.write("\t".join([gff_dict[ID]["seqid"],
+                         gff_dict[ID]["source"], gff_dict[ID]["type"],
+                         gff_dict[ID]["start"], gff_dict[ID]["end"],
+                         gff_dict[ID]["score"], gff_dict[ID]["strand"],
+                         gff_dict[ID]["phase"], gff_dict[ID]["attributes"]]))
             output.write("\n")
             for id in sorted(matures, key=matures.get, reverse=True):
                 if ID in gff_dict[id]["attributes"]:
-                    output.write("\t".join([gff_dict[id]["seqid"], gff_dict[id]["source"],
-                        gff_dict[id]["type"], gff_dict[id]["start"], gff_dict[id]["end"],
-                        gff_dict[id]["score"], gff_dict[id]["strand"],
-                        gff_dict[id]["phase"], gff_dict[id]["attributes"]]))
+                    output.write("\t".join([gff_dict[id]["seqid"],
+                                 gff_dict[id]["source"], gff_dict[id]["type"],
+                                 gff_dict[id]["start"], gff_dict[id]["end"],
+                                 gff_dict[id]["score"],
+                                 gff_dict[id]["strand"],
+                                 gff_dict[id]["phase"],
+                                 gff_dict[id]["attributes"]]))
                     output.write("\n")


@@ -103,4 +122,4 @@

 if __name__ == "__main__":
     args = Parser()
-    main(args.input, args.output)
\ No newline at end of file
+    main(args.input, args.output)
--- a/mircounts.xml	Tue Jul 25 11:04:28 2017 -0400
+++ b/mircounts.xml	Wed Jul 26 19:15:08 2017 -0400
@@ -1,4 +1,4 @@
-<tool id="mircounts" name="miRcounts" version="0.9.2">
+<tool id="mircounts" name="miRcounts" version="0.9.3">
     <description> Counts miRNA alignments from small RNA sequence data</description>
     <requirements>
         <requirement type="package" version="1.18">gnu-wget</requirement>
--- a/test-data/translated_dme.gff3	Tue Jul 25 11:04:28 2017 -0400
+++ b/test-data/translated_dme.gff3	Wed Jul 26 19:15:08 2017 -0400
@@ -5,9 +5,9 @@
 # microRNAs:               miRBase current_version
 # genome-build-id:         check http://mirbase.org/
 #
-# Hairpin precursor sequences have type "miRNA_primary_transcript".
-# Note, these sequences do not represent the full primary transcript,
-# rather a predicted stem-loop portion that includes the precursor
+# Hairpin precursor sequences have type "miRNA_primary_transcript".
+# Note, these sequences do not represent the full primary transcript,
+# rather a predicted stem-loop portion that includes the precursor
 # miRNA. Mature sequences have type "miRNA".
 #
 dme-bantam	.	miRNA_primary_transcript	1	81	.	+	.	ID=MI0000387;Alias=MI0000387;Name=dme-bantam
--- a/yac.py	Tue Jul 25 11:04:28 2017 -0400
+++ b/yac.py	Wed Jul 26 19:15:08 2017 -0400
@@ -2,11 +2,8 @@
 # yac = yet another clipper
 # v 1.2.1 - 23-08-2014 - Support FastQ output
 # v 1.1.0 - 23-08-2014 - argparse implementation
-# Usage yac.py  $input $output $adapter_to_clip $min $max $Nmode
 # Christophe Antoniewski <drosofff@gmail.com>

-import sys
-import string
 import argparse
 from itertools import islice

@@ -16,17 +13,23 @@
     the_parser.add_argument(
         '--input', action="store", nargs='+', help="input fastq files")
     the_parser.add_argument(
-        '--output', action="store", type=str, help="output, clipped fasta file")
+        '--output', action="store", type=str,
+        help="output, clipped fasta file")
     the_parser.add_argument(
-        '--output_format', action="store", type=str, help="output format, fasta or fastq")
+        '--output_format', action="store", type=str,
+        help="output format, fasta or fastq")
     the_parser.add_argument(
-        '--adapter_to_clip', action="store", type=str, help="adapter sequence to clip")
+        '--adapter_to_clip', action="store", type=str,
+        help="adapter sequence to clip")
     the_parser.add_argument(
-        '--min', action="store", type=int, help="minimal size of clipped sequence to keep")
+        '--min', action="store", type=int,
+        help="minimal size of clipped sequence to keep")
     the_parser.add_argument(
-        '--max', action="store", type=int, help="maximal size of clipped sequence to keep")
+        '--max', action="store", type=int,
+        help="maximal size of clipped sequence to keep")
     the_parser.add_argument('--Nmode', action="store", type=str, choices=[
-                            "accept", "reject"], help="accept or reject sequences with N for clipping")
+                            "accept", "reject"],
+                            help="accept or reject Ns in clipped sequences")
     args = the_parser.parse_args()
     args.adapter_to_clip = args.adapter_to_clip.upper()
     return args
@@ -34,7 +37,8 @@

 class Clip:

-    def __init__(self, inputfile, outputfile, output_format, adapter, minsize, maxsize, Nmode):
+    def __init__(self, inputfile, outputfile, output_format,
+                 adapter, minsize, maxsize, Nmode):
         self.inputfile = inputfile
         self.outputfile = outputfile
         self.output_format = output_format
@@ -44,9 +48,12 @@
         self.Nmode = Nmode

         def motives(sequence):
-            '''return a list of motives for perfect (6nt) or imperfect (7nt with one mismatch) search on import string module'''
+            '''
+            return a list of motives for perfect (6nt) or
+            imperfect (7nt with one mismatch) search on import string module
+            '''
             sequencevariants = [
-                sequence[0:6]]  # initializes the list with the 6mer perfect match
+                sequence[0:6]]  # initializes list with 6mer perfect match
             dicsubst = {"A": "TGCN", "T": "AGCN", "G": "TACN", "C": "GATN"}
             for pos in enumerate(sequence[:6]):
                 for subst in dicsubst[pos[1]]:
@@ -100,6 +107,7 @@
     instanceClip = Clip(*argv)
     instanceClip.handle_io()

+
 if __name__ == "__main__":
     args = Parser()
     id = 0