Mercurial > repos > petr-novak > profrep
comparison gff_selection.py @ 0:a5f1638b73be draft
Uploaded
author | petr-novak |
---|---|
date | Wed, 26 Jun 2019 08:01:42 -0400 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:a5f1638b73be |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 import argparse | |
4 | |
5 | |
6 def check_file_start(gff_file): | |
7 count_comment = 0 | |
8 with open(gff_file, "r") as gff_all: | |
9 line = gff_all.readline() | |
10 while line.startswith("#"): | |
11 line = gff_all.readline() | |
12 count_comment += 1 | |
13 return count_comment | |
14 | |
15 | |
16 def cut_region(GFF_IN, GFF_OUT, REGION, NEW_SEQ_ID): | |
17 ''' | |
18 Extract records for particular sequence and/or region from arbitrary GFF3 file | |
19 in form: original_seq_name:start-end (e.g. chr1:1000-2000) | |
20 Write a new GFF containing only records from this region | |
21 If new SEQ ID for extracted region is not provided, it will be named based on the region to cut | |
22 ! ALLOWS TO CUT ONLY ONE REGION AT A TIME | |
23 ''' | |
24 ## cut a particular part of a paritcular sequence | |
25 if ":" and "-" in REGION: | |
26 seq_to_cut = ":".join(REGION.split(":")[:-1]) | |
27 int_start = int(REGION.split(":")[-1].split("-")[0]) | |
28 int_end = int(REGION.split(":")[-1].split("-")[1]) | |
29 ## cut the whole sequence if region is not specified | |
30 else: | |
31 int_start = 0 | |
32 int_end = float("inf") | |
33 seq_to_cut = REGION | |
34 count_comment = check_file_start(GFF_IN) | |
35 with open(GFF_OUT, "w") as gff_out: | |
36 with open(GFF_IN, "r") as gff_in: | |
37 for comment_idx in range(count_comment): | |
38 next(gff_in) | |
39 gff_out.write("##gff-version 3\n") | |
40 gff_out.write("##sequence region {}\n".format(REGION)) | |
41 for line in gff_in: | |
42 if not line.startswith("#") and line.split("\t")[ | |
43 0] == seq_to_cut and int(float(line.split("\t")[ | |
44 3])) >= int_start and int(float(line.split("\t")[ | |
45 4])) <= int_end: | |
46 new_start = int(line.split("\t")[3]) - int_start + 1 | |
47 new_end = int(line.split("\t")[4]) - int_start + 1 | |
48 gff_out.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format( | |
49 NEW_SEQ_ID, line.split("\t")[1], line.split("\t")[ | |
50 2], new_start, new_end, line.split("\t")[ | |
51 5], line.split("\t")[6], line.split("\t")[ | |
52 7], line.split("\t")[8])) | |
53 | |
54 | |
55 def main(args): | |
56 # Command line arguments | |
57 GFF_IN = args.gff_input | |
58 GFF_OUT = args.gff_output | |
59 REGION = args.region | |
60 NEW_SEQ_ID = args.new_seq_id | |
61 | |
62 if GFF_OUT is None: | |
63 GFF_OUT = "{}_cut{}.gff3".format(GFF_IN, REGION) | |
64 | |
65 if not NEW_SEQ_ID: | |
66 NEW_SEQ_ID = REGION | |
67 | |
68 cut_region(GFF_IN, GFF_OUT, REGION, NEW_SEQ_ID) | |
69 | |
70 | |
71 if __name__ == "__main__": | |
72 | |
73 # Command line arguments | |
74 parser = argparse.ArgumentParser() | |
75 parser.add_argument('-gi', | |
76 '--gff_input', | |
77 type=str, | |
78 required=True, | |
79 help='choose gff file') | |
80 parser.add_argument( | |
81 '-go', '--gff_output', | |
82 type=str, help='choose gff file') | |
83 parser.add_argument('-si', '--new_seq_id', type=str, help=' ') | |
84 parser.add_argument('-rg', '--region', type=str, required=True, help=' ') | |
85 args = parser.parse_args() | |
86 main(args) |