diff gff_selection.py @ 0:a5f1638b73be draft

Uploaded
author petr-novak
date Wed, 26 Jun 2019 08:01:42 -0400
parents
children
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gff_selection.py	Wed Jun 26 08:01:42 2019 -0400
@@ -0,0 +1,86 @@
+#!/usr/bin/env python3
+
+import argparse
+
+
+def check_file_start(gff_file):
+    count_comment = 0
+    with open(gff_file, "r") as gff_all:
+        line = gff_all.readline()
+        while line.startswith("#"):
+            line = gff_all.readline()
+            count_comment += 1
+    return count_comment
+
+
+def cut_region(GFF_IN, GFF_OUT, REGION, NEW_SEQ_ID):
+    '''
+	Extract records for particular sequence and/or region from arbitrary GFF3 file
+	in form: original_seq_name:start-end (e.g. chr1:1000-2000)
+	Write a new GFF containing only records from this region
+	If new SEQ ID for extracted region is not provided, it will be named based on the region to cut
+	! ALLOWS TO CUT ONLY ONE REGION AT A TIME
+	'''
+    ## cut a particular part of a paritcular sequence
+    if ":" and "-" in REGION:
+        seq_to_cut = ":".join(REGION.split(":")[:-1])
+        int_start = int(REGION.split(":")[-1].split("-")[0])
+        int_end = int(REGION.split(":")[-1].split("-")[1])
+    ## cut the whole sequence if region is not specified
+    else:
+        int_start = 0
+        int_end = float("inf")
+        seq_to_cut = REGION
+    count_comment = check_file_start(GFF_IN)
+    with open(GFF_OUT, "w") as gff_out:
+        with open(GFF_IN, "r") as gff_in:
+            for comment_idx in range(count_comment):
+                next(gff_in)
+            gff_out.write("##gff-version 3\n")
+            gff_out.write("##sequence region {}\n".format(REGION))
+            for line in gff_in:
+                if not line.startswith("#") and line.split("\t")[
+                        0] == seq_to_cut and int(float(line.split("\t")[
+                            3])) >= int_start and int(float(line.split("\t")[
+                                4])) <= int_end:
+                    new_start = int(line.split("\t")[3]) - int_start + 1
+                    new_end = int(line.split("\t")[4]) - int_start + 1
+                    gff_out.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
+                        NEW_SEQ_ID, line.split("\t")[1], line.split("\t")[
+                            2], new_start, new_end, line.split("\t")[
+                                5], line.split("\t")[6], line.split("\t")[
+                                    7], line.split("\t")[8]))
+
+
+def main(args):
+    # Command line arguments
+    GFF_IN = args.gff_input
+    GFF_OUT = args.gff_output
+    REGION = args.region
+    NEW_SEQ_ID = args.new_seq_id
+
+    if GFF_OUT is None:
+        GFF_OUT = "{}_cut{}.gff3".format(GFF_IN, REGION)
+
+    if not NEW_SEQ_ID:
+        NEW_SEQ_ID = REGION
+
+    cut_region(GFF_IN, GFF_OUT, REGION, NEW_SEQ_ID)
+
+
+if __name__ == "__main__":
+
+    # Command line arguments
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-gi',
+                        '--gff_input',
+                        type=str,
+                        required=True,
+                        help='choose gff file')
+    parser.add_argument(
+        '-go', '--gff_output',
+        type=str, help='choose gff file')
+    parser.add_argument('-si', '--new_seq_id', type=str, help=' ')
+    parser.add_argument('-rg', '--region', type=str, required=True, help=' ')
+    args = parser.parse_args()
+    main(args)