Mercurial > repos > earlhaminst > gstf_preparation
annotate gstf_preparation.py @ 15:9c62ad7dd113 draft default tip
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit a4e49466bf746633ddc26d052b65ca41576d18fb"
author | earlhaminst |
---|---|
date | Thu, 29 Apr 2021 19:46:38 +0000 |
parents | 598e9172b8e7 |
children |
rev | line source |
---|---|
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
1 import json |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
2 import optparse |
10
e8e75a79de59
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9c8611fee927883f50bc6955771aa69df1ce8457"
earlhaminst
parents:
9
diff
changeset
|
3 import os |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
4 import sqlite3 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
5 import sys |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
6 |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
7 version = "0.5.0" |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
8 gene_count = 0 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
9 |
8
92f3966d5bc3
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 88ba62ae8c3d9587a0015c72209242ad0c1df0c2
earlhaminst
parents:
6
diff
changeset
|
10 |
13
51a7a2a82902
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents:
12
diff
changeset
|
def asbool(val):
    """Convert ``val`` to a bool.

    Strings are compared case-insensitively after stripping surrounding
    whitespace: 'true'/'1' give True and 'false'/'0' give False. Any other
    string raises ValueError. Non-string values are passed to ``bool()``.
    """
    # Anything that is not a string just follows normal Python truthiness.
    if not isinstance(val, str):
        return bool(val)

    normalized = val.strip().lower()
    if normalized in ('true', '1'):
        return True
    if normalized in ('false', '0'):
        return False
    raise ValueError(f"Cannot convert {val} to bool")
51a7a2a82902
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents:
12
diff
changeset
|
22 |
51a7a2a82902
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents:
12
diff
changeset
|
23 |
51a7a2a82902
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents:
12
diff
changeset
|
class Sequence:
    """A header line together with the lines of its sequence.

    ``header`` is stored and printed as given. ``sequence_parts`` is an
    iterable of strings (one per sequence line) that is joined lazily by the
    ``sequence`` property.
    """

    def __init__(self, header, sequence_parts):
        self.header = header
        self.sequence_parts = sequence_parts
        # Cache for the joined sequence; filled on first access of `sequence`.
        self._sequence = None

    @property
    def sequence(self):
        """Return all sequence parts concatenated into a single string.

        The joined string is computed on first access and cached, so later
        changes to ``sequence_parts`` are not reflected.
        """
        if self._sequence is None:
            self._sequence = ''.join(self.sequence_parts)
        return self._sequence

    def print(self, fh=None):
        """Write the header, then each sequence part on its own line, to ``fh``.

        ``fh`` defaults to the *current* ``sys.stdout``. It is resolved at
        call time rather than at definition time, so redirecting or replacing
        ``sys.stdout`` after this module was imported is honoured.
        """
        if fh is None:
            fh = sys.stdout
        print(self.header, file=fh)
        for line in self.sequence_parts:
            print(line, file=fh)
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
40 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
41 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def FASTAReader_gen(fasta_filename):
    """Yield one ``Sequence`` per record of the FASTA file ``fasta_filename``.

    Each record's header is the '>' line with trailing whitespace stripped
    (the leading '>' is kept). Its sequence parts are the following lines up
    to the next '>' line or end of file, each with trailing whitespace
    stripped. An empty file yields nothing.

    Raises:
        AssertionError: if a record does not start with a '>' header line
            (in practice, when the first line of the file is not a header).
    """
    with open(fasta_filename) as fasta_file:
        line = fasta_file.readline()
        while line:
            # Raised explicitly instead of via `assert` so the check is not
            # stripped under `python -O`; the exception type and message are
            # kept so existing callers see the same failure.
            if not line.startswith('>'):
                raise AssertionError("FASTA headers must start with >")
            header = line.rstrip()
            sequence_parts = []
            line = fasta_file.readline()
            # Collect every line up to the next header (or end of file).
            while line and line[0] != '>':
                sequence_parts.append(line.rstrip())
                line = fasta_file.readline()
            yield Sequence(header, sequence_parts)
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
56 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
57 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def create_tables(conn):
    """Create the database schema on ``conn`` and commit it.

    Creates the ``meta`` table (and records the module-level ``version`` in
    it), the ``gene`` table with an index on ``gene_symbol``, the
    ``transcript`` table, and the temporary ``transcript_join_gene`` view.
    """
    meta_table_sql = '''CREATE TABLE meta (
        version VARCHAR PRIMARY KEY NOT NULL)'''

    gene_table_sql = '''CREATE TABLE gene (
        gene_id VARCHAR PRIMARY KEY NOT NULL,
        gene_symbol VARCHAR,
        seq_region_name VARCHAR NOT NULL,
        seq_region_start INTEGER NOT NULL,
        seq_region_end INTEGER NOT NULL,
        seq_region_strand INTEGER NOT NULL,
        species VARCHAR NOT NULL,
        biotype VARCHAR,
        gene_json VARCHAR NOT NULL)'''
    gene_symbol_index_sql = 'CREATE INDEX gene_symbol_index ON gene (gene_symbol)'

    transcript_table_sql = '''CREATE TABLE transcript (
        transcript_id VARCHAR PRIMARY KEY NOT NULL,
        transcript_symbol VARCHAR,
        protein_id VARCHAR UNIQUE,
        protein_sequence VARCHAR,
        biotype VARCHAR,
        is_canonical BOOLEAN NOT NULL DEFAULT FALSE,
        gene_id VARCHAR NOT NULL REFERENCES gene(gene_id))'''

    # The following temporary view is not used in GAFA, so schema changes to it
    # don't require a meta version upgrade.
    transcript_join_gene_sql = '''CREATE TEMPORARY VIEW transcript_join_gene AS
        SELECT transcript_id, transcript_symbol, COALESCE(transcript.biotype, gene.biotype) AS biotype, is_canonical, gene_id, gene_symbol, seq_region_name, species
        FROM transcript JOIN gene
        USING (gene_id)'''

    cursor = conn.cursor()
    cursor.execute(meta_table_sql)
    cursor.execute('INSERT INTO meta (version) VALUES (?)',
                   (version, ))
    # Remaining statements take no parameters; run them in schema order.
    for statement in (gene_table_sql, gene_symbol_index_sql,
                      transcript_table_sql, transcript_join_gene_sql):
        cursor.execute(statement)

    conn.commit()
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
96 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
97 |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
def fetch_transcript_and_gene(conn, transcript_id):
    """Return the ``transcript_join_gene`` row for ``transcript_id``.

    The result is the first matching row as produced by ``fetchone()``, or
    None when no row has that transcript id.
    """
    query = 'SELECT * FROM transcript_join_gene WHERE transcript_id=?'
    # Cursor.execute() returns the cursor itself, so the calls can be chained.
    return conn.cursor().execute(query, (transcript_id, )).fetchone()
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
104 |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
105 |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
def remove_type_from_list_of_ids(ids):
    """Apply ``remove_type_from_id`` to each id of a comma-separated string.

    ``ids`` is split on ',' and the processed ids are joined back with ','
    in the same order.
    """
    stripped_ids = [remove_type_from_id(item) for item in ids.split(',')]
    return ','.join(stripped_ids)
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
108 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
109 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def remove_type_from_id(id_):
    """Return ``id_`` without its leading '<type>:' prefix.

    Everything up to and including the first ':' is dropped. A string
    containing no ':' is returned unchanged.
    """
    prefix, colon, remainder = id_.partition(':')
    # partition() leaves `colon` empty when there is no ':' in the string.
    return remainder if colon else prefix
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
116 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
117 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
118 def feature_to_dict(cols, parent_dict=None): |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
119 d = { |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
120 'end': int(cols[4]), |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
121 'start': int(cols[3]), |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
122 } |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
123 for attr in cols[8].split(';'): |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
124 if '=' in attr: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
125 (tag, value) = attr.split('=') |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
126 if tag == 'ID': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
127 tag = 'id' |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
128 value = remove_type_from_id(value) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
129 elif tag == 'Parent': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
130 value = remove_type_from_list_of_ids(value) |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
131 elif tag == 'representative': |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
132 tag = 'is_canonical' |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
133 d[tag] = value |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
134 if cols[6] == '+': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
135 d['strand'] = 1 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
136 elif cols[6] == '-': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
137 d['strand'] = -1 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
138 else: |
15
9c62ad7dd113
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit a4e49466bf746633ddc26d052b65ca41576d18fb"
earlhaminst
parents:
14
diff
changeset
|
139 raise Exception(f"Unrecognized strand: {cols[6]}") |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
140 if parent_dict is not None and 'Parent' in d: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
141 # a 3' UTR can be split among multiple exons |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
142 # a 5' UTR can be split among multiple exons |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
143 # a CDS can be part of multiple transcripts |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
144 for parent in d['Parent'].split(','): |
10
e8e75a79de59
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9c8611fee927883f50bc6955771aa69df1ce8457"
earlhaminst
parents:
9
diff
changeset
|
145 parent_dict.setdefault(parent, []).append(d) |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
146 return d |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
147 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
148 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def add_gene_to_dict(cols, species, gene_dict):
    """
    Build a gene record from one annotation row and store it in ``gene_dict``.

    The row is parsed by ``feature_to_dict``; the resulting dict is then
    extended in place with gene-level keys and registered under its id.

    :param cols: column values of the row; ``cols[0]`` is stored as
        ``seq_region_name`` and ``cols[8]`` (the attribute column) is quoted
        in the error message
    :param species: value stored under the record's ``species`` key
    :param gene_dict: mapping updated in place, keyed by gene id
    :raises Exception: if the parsed record has no ``id`` or an empty one

    Side effect: reads the module-level ``gene_count`` as the record's
    ``member_id`` and then increments it.
    """
    global gene_count
    gene = feature_to_dict(cols)
    # The 'id' key is only present when the attribute column carried an ID
    # tag (the exon/CDS helpers in this file test "'id' not in ..." for the
    # same reason), so look it up with .get(): a missing ID must reach the
    # descriptive error below instead of failing with a bare KeyError.
    gene_id = gene.get('id')
    if not gene_id:
        raise Exception(f"Id not found among column 9 attribute tags: {cols[8]}")
    gene.update({
        'member_id': gene_count,
        'object_type': 'Gene',
        'seq_region_name': cols[0],
        'species': species,
        # Filled later with the transcripts whose Parent is this gene.
        'Transcript': [],
        # None when the row has no Name attribute.
        'display_name': gene.get('Name'),
    })
    gene_dict[gene_id] = gene
    gene_count = gene_count + 1
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
164 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
165 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def add_transcript_to_dict(cols, species, transcript_dict):
    """
    Build a transcript record from one annotation row and register it.

    The row is parsed by ``feature_to_dict``; transcript-level keys are then
    set on the resulting dict, which is stored in ``transcript_dict`` under
    its ``id``.

    :param cols: column values of the row; ``cols[0]`` is stored as
        ``seq_region_name``
    :param species: value stored under the record's ``species`` key
    :param transcript_dict: mapping updated in place, keyed by transcript id
    """
    record = feature_to_dict(cols)
    record['object_type'] = 'Transcript'
    record['seq_region_name'] = cols[0]
    record['species'] = species
    # None when the row has no Name attribute.
    record['display_name'] = record.get('Name')
    transcript_dict[record['id']] = record
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
175 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
176 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def add_exon_to_dict(cols, species, exon_parent_dict):
    """
    Build an exon record from one annotation row.

    ``feature_to_dict`` parses the row and, because ``exon_parent_dict`` is
    passed to it, files the record under each of its parents. The same dict
    object is then completed in place, so the copies reachable through
    ``exon_parent_dict`` see the extra keys too.

    :param cols: column values of the row; ``cols[3]`` and ``cols[4]`` are
        converted to int to compute the length, ``cols[0]`` is stored as
        ``seq_region_name``
    :param species: value stored under the record's ``species`` key
    :param exon_parent_dict: parent id -> list of exon records, filled by
        ``feature_to_dict``
    """
    record = feature_to_dict(cols, exon_parent_dict)
    # Inclusive coordinates: a feature spanning 5..5 has length 1.
    span = int(cols[4]) - int(cols[3]) + 1
    record['length'] = span
    record['object_type'] = 'Exon'
    record['seq_region_name'] = cols[0]
    record['species'] = species
    # Fall back to the Name attribute when the row carried no ID.
    if 'Name' in record and 'id' not in record:
        record['id'] = record['Name']
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
187 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
188 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def add_cds_to_dict(cols, cds_parent_dict):
    """
    Build a CDS record from one annotation row and make sure it has an id
    where one can be derived.

    ``feature_to_dict`` parses the row and files the record under each of
    its parents in ``cds_parent_dict``. If the record has no ``id``, the
    ``Name`` attribute is used; failing that, the ``Parent`` value is used,
    but only when it names a single parent (contains no comma). Otherwise
    the record is left without an ``id``.

    :param cols: column values of the row
    :param cds_parent_dict: parent id -> list of CDS records, filled by
        ``feature_to_dict``
    """
    record = feature_to_dict(cols, cds_parent_dict)
    if 'id' in record:
        return
    if 'Name' in record:
        record['id'] = record['Name']
        return
    # A comma-separated Parent lists several parents, so it cannot serve as
    # a single id.
    if 'Parent' not in record or ',' in record['Parent']:
        return
    record['id'] = record['Parent']
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
196 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
197 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict):
    """
    Link the separately collected feature records into a gene hierarchy.

    All arguments are modified in place and nothing is returned:

    1. Each transcript receives, under ``'Exon'``, the list of its exons
       sorted by ``start``.
    2. Each transcript that has CDS and/or UTR records receives a
       ``'Translation'`` dict. Its ``start``/``end`` come from the first and
       last CDS when CDS records exist; otherwise they are derived from the
       positions adjacent to the UTRs, falling back to the transcript's own
       coordinates for a side with no UTR.
    3. Each transcript is appended to the ``'Transcript'`` list of every
       gene named in its ``Parent`` attribute that exists in ``gene_dict``.

    :param gene_dict: gene id -> gene record
    :param transcript_dict: transcript id -> transcript record
    :param exon_parent_dict: transcript id -> list of exon records
    :param cds_parent_dict: transcript id -> list of CDS records
    :param five_prime_utr_parent_dict: transcript id -> list of 5' UTR records
    :param three_prime_utr_parent_dict: transcript id -> list of 3' UTR records
    :raises Exception: if the CDS records of a transcript carry more than one
        distinct id, or if a CDS extends into a UTR
    """
    def by_start(feature):
        return feature['start']

    # Step 1: attach exons, ordered by genomic start, to known transcripts.
    for owner_id, exons in exon_parent_dict.items():
        if owner_id not in transcript_dict:
            continue
        exons.sort(key=by_start)
        transcript_dict[owner_id]['Exon'] = exons

    # Step 2: build the Translation of every transcript.
    for transcript_id, transcript in transcript_dict.items():
        translation = {
            'CDS': [],
            'id': None,
            'end': transcript['end'],
            'object_type': 'Translation',
            'species': transcript['species'],
            'start': transcript['start'],
        }
        has_cds = transcript_id in cds_parent_dict
        utr_based_start = None
        utr_based_end = None

        if has_cds:
            coding_parts = cds_parent_dict[transcript_id]
            distinct_ids = {part['id'] for part in coding_parts}
            if len(distinct_ids) > 1:
                msg = (
                    f"Found multiple CDS IDs ({distinct_ids}) for transcript '{transcript_id}'.\n"
                    "This is not supported by the Ensembl JSON format. If a CDS is split across\n"
                    "multiple discontinuous genomic locations, the GFF3 standard requires that all\n"
                    "corresponding lines use the same ID attribute."
                )
                raise Exception(msg)
            cds_id = distinct_ids.pop()
            translation['id'] = cds_id
            coding_parts.sort(key=by_start)
            translation['CDS'] = coding_parts
            translation['start'] = coding_parts[0]['start']
            translation['end'] = coding_parts[-1]['end']

        # On the forward strand the 5' UTR precedes the coding region and the
        # 3' UTR follows it; on the reverse strand the roles are swapped.
        if transcript_id in five_prime_utr_parent_dict:
            leading_utrs = five_prime_utr_parent_dict[transcript_id]
            leading_utrs.sort(key=by_start)
            if transcript['strand'] == 1:
                utr_based_start = leading_utrs[-1]['end'] + 1
            else:
                utr_based_end = leading_utrs[0]['start'] - 1
        if transcript_id in three_prime_utr_parent_dict:
            trailing_utrs = three_prime_utr_parent_dict[transcript_id]
            trailing_utrs.sort(key=by_start)
            if transcript['strand'] == 1:
                utr_based_end = trailing_utrs[0]['start'] - 1
            else:
                utr_based_start = trailing_utrs[-1]['end'] + 1

        # With CDS records the UTR-derived bounds are only a consistency
        # check; without them they define the translation bounds.
        if utr_based_start is not None:
            if not has_cds:
                translation['start'] = utr_based_start
            elif utr_based_start > translation['start']:
                raise Exception(f"Transcript {transcript_id} has the start of CDS {cds_id} overlapping with the UTR end")
        if utr_based_end is not None:
            if not has_cds:
                translation['end'] = utr_based_end
            elif utr_based_end < translation['end']:
                raise Exception(f"Transcript {transcript_id} has the end of CDS {cds_id} overlapping with the UTR start")

        if not has_cds and utr_based_start is None and utr_based_end is None:
            # Neither CDS nor UTR: no Translation for this transcript.
            continue
        transcript['Translation'] = translation

    # Step 3: hang transcripts on their genes.
    for transcript in transcript_dict.values():
        if 'Parent' not in transcript:
            continue
        # A polycistronic transcript can have multiple parents
        for gene_id in transcript['Parent'].split(','):
            if gene_id in gene_dict:
                gene_dict[gene_id]['Transcript'].append(transcript)
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
267 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
268 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
def write_gene_dict_to_db(conn, gene_dict):
    """
    Insert the genes of `gene_dict` and their transcripts into the database.

    For every value of `gene_dict` one row is added to the `gene` table
    (the whole gene is also stored JSON-encoded in the `gene_json` column)
    and one row is added to the `transcript` table for each element of the
    gene's optional 'Transcript' list. All the changes are committed at the
    end.

    Genes are skipped when:
    - the value is None;
    - the gene has a 'confidence' key whose value is not 'high'
      (case-insensitive); a message is then printed to stderr.

    Raises an Exception, chained to the original error, if a transcript
    cannot be inserted.
    """
    cur = conn.cursor()

    for gene in gene_dict.values():
        if gene is None:
            # This can happen when loading a JSON file from Ensembl
            continue
        if 'confidence' in gene and gene['confidence'].lower() != 'high':
            print(f"Gene {gene['id']} has confidence {gene['confidence']} (not high), discarding", file=sys.stderr)
            continue
        gene_id = gene['id']
        cur.execute('INSERT INTO gene (gene_id, gene_symbol, seq_region_name, seq_region_start, seq_region_end, seq_region_strand, species, biotype, gene_json) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)',
                    (gene_id, gene.get('display_name'), gene['seq_region_name'], gene['start'], gene['end'], gene['strand'], gene['species'], gene.get('biotype'), json.dumps(gene)))

        # A gene without a 'Transcript' key simply has no transcript rows
        for transcript in gene.get("Transcript", []):
            transcript_id = transcript['id']
            transcript_symbol = transcript.get('display_name')
            # Transcripts without a 'Translation' get a NULL protein_id
            protein_id = transcript.get('Translation', {}).get('id')
            biotype = transcript.get('biotype')
            # asbool() is defined elsewhere in this file; presumably it
            # normalises the value to a bool - TODO confirm
            is_canonical = asbool(transcript.get('is_canonical', False))
            to_insert = (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id)
            try:
                cur.execute('INSERT INTO transcript (transcript_id, transcript_symbol, protein_id, biotype, is_canonical, gene_id) VALUES (?, ?, ?, ?, ?, ?)',
                            to_insert)
            except Exception as e:
                # Report the offending row, keeping the original error as
                # the explicit cause
                raise Exception(f"Error while inserting {to_insert} into transcript table: {e}") from e

    conn.commit()
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
298 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
299 |
9
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
def remove_id_version(s, force=False):
    """
    Return `s` without its optional '.VERSION' suffix.

    The suffix is only removed when `s` is an Ensembl id (i.e. it starts
    with 'ENS') or when `force` is True; otherwise `s` is returned as is.
    Everything from the first '.' onwards is dropped.
    """
    if not (force or s.startswith('ENS')):
        return s
    unversioned, _, _ = s.partition('.')
    return unversioned
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
309 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
310 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
311 def __main__(): |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
312 parser = optparse.OptionParser() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
313 parser.add_option('--gff3', action='append', default=[], help='GFF3 file to convert, in SPECIES:FILENAME format. Use multiple times to add more files') |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
314 parser.add_option('--json', action='append', default=[], help='JSON file to merge. Use multiple times to add more files') |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
315 parser.add_option('--fasta', action='append', default=[], help='Path of the input FASTA files') |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
316 parser.add_option('--filter', type='choice', choices=['canonical', 'coding', ''], default='', help='Which transcripts to keep') |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
317 parser.add_option('--headers', type='choice', |
12
99bae410128c
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 4579d0c461c30183a3092d84013e30f53f072ca1-dirty"
earlhaminst
parents:
11
diff
changeset
|
318 choices=['TranscriptId_species', 'TranscriptID-GeneSymbol_species', 'TranscriptID-TranscriptSymbol_species', ''], |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
319 default='', help='Change the header line of the FASTA sequences to this format') |
6
56bbdbfe3eaa
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents:
5
diff
changeset
|
320 parser.add_option('--regions', default="", help='Comma-separated list of region IDs for which FASTA sequences should be filtered') |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
321 parser.add_option('-o', '--output', help='Path of the output SQLite file') |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
322 parser.add_option('--of', help='Path of the output FASTA file') |
10
e8e75a79de59
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9c8611fee927883f50bc6955771aa69df1ce8457"
earlhaminst
parents:
9
diff
changeset
|
323 parser.add_option('--ff', default=os.devnull, help='Path of the filtered sequences output FASTA file') |
6
56bbdbfe3eaa
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents:
5
diff
changeset
|
324 |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
325 options, args = parser.parse_args() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
326 if args: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
327 raise Exception('Use options to provide inputs') |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
328 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
329 conn = sqlite3.connect(options.output) |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
330 conn.row_factory = sqlite3.Row |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
331 conn.execute('PRAGMA foreign_keys = ON') |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
332 create_tables(conn) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
333 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
334 for gff3_arg in options.gff3: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
335 try: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
336 (species, filename) = gff3_arg.split(':') |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
337 except ValueError: |
15
9c62ad7dd113
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit a4e49466bf746633ddc26d052b65ca41576d18fb"
earlhaminst
parents:
14
diff
changeset
|
338 raise Exception(f"Argument for --gff3 '{gff3_arg}' is not in the SPECIES:FILENAME format") |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
339 gene_dict = dict() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
340 transcript_dict = dict() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
341 exon_parent_dict = dict() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
342 cds_parent_dict = dict() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
343 five_prime_utr_parent_dict = dict() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
344 three_prime_utr_parent_dict = dict() |
5
b3ba0c84667c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents:
4
diff
changeset
|
345 unimplemented_feature_nlines_dict = dict() |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
346 |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
347 with open(filename) as f: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
348 for i, line in enumerate(f, start=1): |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
349 line = line.strip() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
350 if not line: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
351 # skip empty lines |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
352 continue |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
353 if line[0] == '#': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
354 # skip comment lines |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
355 continue |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
356 cols = line.split('\t') |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
357 if len(cols) != 9: |
15
9c62ad7dd113
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit a4e49466bf746633ddc26d052b65ca41576d18fb"
earlhaminst
parents:
14
diff
changeset
|
358 raise Exception(f"Line {i} in file '{filename}': '{line}' does not have 9 columns") |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
359 feature_type = cols[2] |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
360 try: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
361 if feature_type == 'gene': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
362 add_gene_to_dict(cols, species, gene_dict) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
363 elif feature_type in ('mRNA', 'transcript'): |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
364 add_transcript_to_dict(cols, species, transcript_dict) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
365 elif feature_type == 'exon': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
366 add_exon_to_dict(cols, species, exon_parent_dict) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
367 elif feature_type == 'five_prime_UTR': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
368 feature_to_dict(cols, five_prime_utr_parent_dict) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
369 elif feature_type == 'three_prime_UTR': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
370 feature_to_dict(cols, three_prime_utr_parent_dict) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
371 elif feature_type == 'CDS': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
372 add_cds_to_dict(cols, cds_parent_dict) |
5
b3ba0c84667c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents:
4
diff
changeset
|
373 elif feature_type in unimplemented_feature_nlines_dict: |
b3ba0c84667c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents:
4
diff
changeset
|
374 unimplemented_feature_nlines_dict[feature_type] += 1 |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
375 else: |
5
b3ba0c84667c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents:
4
diff
changeset
|
376 unimplemented_feature_nlines_dict[feature_type] = 0 |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
377 except Exception as e: |
15
9c62ad7dd113
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit a4e49466bf746633ddc26d052b65ca41576d18fb"
earlhaminst
parents:
14
diff
changeset
|
378 print(f"Line {i} in file '{filename}': {e}", file=sys.stderr) |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
379 |
5
b3ba0c84667c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents:
4
diff
changeset
|
380 for unimplemented_feature, nlines in unimplemented_feature_nlines_dict.items(): |
15
9c62ad7dd113
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit a4e49466bf746633ddc26d052b65ca41576d18fb"
earlhaminst
parents:
14
diff
changeset
|
381 print(f"Skipped {nlines} lines in GFF3 file '{filename}': '{unimplemented_feature}' is not an implemented feature type", file=sys.stderr) |
5
b3ba0c84667c
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 95bab1105cf8a7b07c668f08f712399e8775a4ae
earlhaminst
parents:
4
diff
changeset
|
382 |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
383 join_dicts(gene_dict, transcript_dict, exon_parent_dict, cds_parent_dict, five_prime_utr_parent_dict, three_prime_utr_parent_dict) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
384 write_gene_dict_to_db(conn, gene_dict) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
385 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
386 for json_arg in options.json: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
387 with open(json_arg) as f: |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
388 write_gene_dict_to_db(conn, json.load(f)) |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
389 |
9
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
390 # Read the FASTA files a first time to: |
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
391 # - determine for each file if we need to force the removal of the version |
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
392 # from the transcript id |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
393 # - fill gene_transcripts_dict when keeping only the canonical transcripts |
9
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
394 force_remove_id_version_file_list = [] |
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
395 gene_transcripts_dict = dict() |
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
396 for fasta_arg in options.fasta: |
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
397 force_remove_id_version = False |
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
398 found_gene_transcript = False |
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
399 for entry in FASTAReader_gen(fasta_arg): |
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
400 # Extract the transcript id by removing everything after the first space and then removing the version if needed |
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
401 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version) |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
402 |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
403 transcript = fetch_transcript_and_gene(conn, transcript_id) |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
404 if not transcript and not found_gene_transcript: |
9
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
405 # We have not found a proper gene transcript in this file yet, |
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
406 # try to force the removal of the version from the transcript id |
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
407 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], True) |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
408 transcript = fetch_transcript_and_gene(conn, transcript_id) |
9
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
409 # Remember that we need to force the removal for this file |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
410 if transcript: |
9
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
411 force_remove_id_version = True |
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
412 force_remove_id_version_file_list.append(fasta_arg) |
15
9c62ad7dd113
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit a4e49466bf746633ddc26d052b65ca41576d18fb"
earlhaminst
parents:
14
diff
changeset
|
413 print(f"Forcing removal of id version in FASTA file '{fasta_arg}'", file=sys.stderr) |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
414 if not transcript: |
13
51a7a2a82902
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents:
12
diff
changeset
|
415 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr) |
9
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
416 continue |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
417 if options.filter != 'canonical': |
9
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
418 break |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
419 found_gene_transcript = True |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
420 |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
421 if len(entry.sequence) % 3 != 0: |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
422 continue |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
423 transcript_biotype = transcript['biotype'] # This is the biotype of the transcript or, if that is NULL, the one of the gene |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
424 if transcript_biotype and transcript_biotype != 'protein_coding': |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
425 continue |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
426 gene_transcripts_dict.setdefault(transcript['gene_id'], []).append((transcript_id, transcript['is_canonical'], len(entry.sequence))) |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
427 |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
428 if options.filter == 'canonical': |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
429 selected_transcript_ids = [] |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
430 for gene_id, transcript_tuples in gene_transcripts_dict.items(): |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
431 canonical_transcript_ids = [id_ for (id_, is_canonical, _) in transcript_tuples if is_canonical] |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
432 if not canonical_transcript_ids: |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
433 # Select the transcript with the longest sequence. If more than |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
434 # one transcript has the same longest sequence for a gene, the |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
435 # first one to appear in the FASTA file is selected. |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
436 selected_transcript_id = max(transcript_tuples, key=lambda transcript_tuple: transcript_tuple[2])[0] |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
437 elif len(canonical_transcript_ids) > 1: |
15
9c62ad7dd113
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit a4e49466bf746633ddc26d052b65ca41576d18fb"
earlhaminst
parents:
14
diff
changeset
|
438 raise Exception(f"Gene {gene_id} has more than 1 canonical transcripts") |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
439 else: |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
440 selected_transcript_id = canonical_transcript_ids[0] |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
441 selected_transcript_ids.append(selected_transcript_id) |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
442 |
6
56bbdbfe3eaa
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents:
5
diff
changeset
|
443 regions = [_.strip().lower() for _ in options.regions.split(",")] |
56bbdbfe3eaa
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents:
5
diff
changeset
|
444 with open(options.of, 'w') as output_fasta_file, open(options.ff, 'w') as filtered_fasta_file: |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
445 for fasta_arg in options.fasta: |
9
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
446 force_remove_id_version = fasta_arg in force_remove_id_version_file_list |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
447 for entry in FASTAReader_gen(fasta_arg): |
9
f4acbfe8d6fe
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 2f56285b1ef694d732c8b2637e3e924f8a626e55
earlhaminst
parents:
8
diff
changeset
|
448 transcript_id = remove_id_version(entry.header[1:].lstrip().split(' ')[0], force_remove_id_version) |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
449 |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
450 transcript = fetch_transcript_and_gene(conn, transcript_id) |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
451 if not transcript: |
13
51a7a2a82902
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents:
12
diff
changeset
|
452 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' not found in the gene feature information", file=sys.stderr) |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
453 continue |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
454 |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
455 if options.filter == 'canonical': |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
456 # We already filtered out non-protein-coding transcripts when populating gene_transcripts_dict |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
457 if transcript_id not in selected_transcript_ids: |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
458 continue |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
459 elif options.filter == 'coding': |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
460 if len(entry.sequence) % 3 != 0: |
13
51a7a2a82902
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents:
12
diff
changeset
|
461 print(f"Transcript '{transcript_id}' in FASTA file '{fasta_arg}' has a coding sequence length which is not multiple of 3, removing from FASTA output", file=sys.stderr) |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
462 continue |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
463 transcript_biotype = transcript['biotype'] # This is the biotype of the transcript or, if that is NULL, the one of the gene |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
464 if transcript_biotype and transcript_biotype != 'protein_coding': |
13
51a7a2a82902
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents:
12
diff
changeset
|
465 print(f"Transcript {transcript_id} has biotype {transcript_biotype} (not protein-coding), removing from FASTA output", file=sys.stderr) |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
466 continue |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
467 |
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
468 if options.headers == "TranscriptId_species": |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
469 # Change the FASTA header to '>TranscriptId_species', as required by TreeBest |
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
470 # Remove any underscore in the species |
13
51a7a2a82902
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents:
12
diff
changeset
|
471 entry.header = ">{}_{}".format(transcript_id, transcript['species'].replace('_', '')) |
12
99bae410128c
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 4579d0c461c30183a3092d84013e30f53f072ca1-dirty"
earlhaminst
parents:
11
diff
changeset
|
472 elif options.headers == "TranscriptID-GeneSymbol_species": |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
473 # Remove any underscore in the species |
13
51a7a2a82902
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents:
12
diff
changeset
|
474 entry.header = ">{}-{}_{}".format(transcript_id, transcript['gene_symbol'], transcript['species'].replace('_', '')) |
12
99bae410128c
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 4579d0c461c30183a3092d84013e30f53f072ca1-dirty"
earlhaminst
parents:
11
diff
changeset
|
475 elif options.headers == "TranscriptID-TranscriptSymbol_species": |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
476 # Remove any underscore in the species |
13
51a7a2a82902
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 9178f870760132962f8d3a26ea55c201880bb018-dirty"
earlhaminst
parents:
12
diff
changeset
|
477 entry.header = ">{}-{}_{}".format(transcript_id, transcript['transcript_symbol'], transcript['species'].replace('_', '')) |
4
284f64ad9d43
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit cda3ecab1a34376cc7d4d392a34dc810847cbf0b-dirty
earlhaminst
parents:
3
diff
changeset
|
478 |
11
dbe37a658cd2
"planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 133a11e7195f8da83c5b661d8babb3f6d9e18812"
earlhaminst
parents:
10
diff
changeset
|
479 if transcript['seq_region_name'].lower() in regions: |
8
92f3966d5bc3
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 88ba62ae8c3d9587a0015c72209242ad0c1df0c2
earlhaminst
parents:
6
diff
changeset
|
480 entry.print(filtered_fasta_file) |
6
56bbdbfe3eaa
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit fa875eea77a9471acada2b7b8882a0467994c960
earlhaminst
parents:
5
diff
changeset
|
481 else: |
8
92f3966d5bc3
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 88ba62ae8c3d9587a0015c72209242ad0c1df0c2
earlhaminst
parents:
6
diff
changeset
|
482 entry.print(output_fasta_file) |
0
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
483 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
484 conn.close() |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
485 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
486 |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
487 if __name__ == '__main__': |
28879ca33b5f
planemo upload for repository https://github.com/TGAC/earlham-galaxytools/tree/master/tools/gstf_preparation commit 651fae48371f845578753052c6fe173e3bb35670
earlhaminst
parents:
diff
changeset
|
488 __main__() |