Mercurial > repos > rmarenco > hubarchivecreator
annotate Gtf.py @ 19:0152500d9acd draft
Uploaded
| author | rmarenco |
|---|---|
| date | Thu, 13 Oct 2016 22:49:01 -0400 |
| parents | c02720d1afee |
| children | 2677f1899aa8 |
| rev | line source |
|---|---|
|
1
fb5e60d4d18a
planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 64cfc08088d11f6818c1b4e5514ef9e67969eaff-dirty
rmarenco
parents:
diff
changeset
|
1 #!/usr/bin/python |
|
fb5e60d4d18a
planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 64cfc08088d11f6818c1b4e5514ef9e67969eaff-dirty
rmarenco
parents:
diff
changeset
|
2 |
|
fb5e60d4d18a
planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 64cfc08088d11f6818c1b4e5514ef9e67969eaff-dirty
rmarenco
parents:
diff
changeset
|
3 import os |
|
fb5e60d4d18a
planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 64cfc08088d11f6818c1b4e5514ef9e67969eaff-dirty
rmarenco
parents:
diff
changeset
|
4 import tempfile |
|
fb5e60d4d18a
planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 64cfc08088d11f6818c1b4e5514ef9e67969eaff-dirty
rmarenco
parents:
diff
changeset
|
5 |
|
fb5e60d4d18a
planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 64cfc08088d11f6818c1b4e5514ef9e67969eaff-dirty
rmarenco
parents:
diff
changeset
|
6 # Internal dependencies |
|
fb5e60d4d18a
planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 64cfc08088d11f6818c1b4e5514ef9e67969eaff-dirty
rmarenco
parents:
diff
changeset
|
7 from Datatype import Datatype |
|
fb5e60d4d18a
planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 64cfc08088d11f6818c1b4e5514ef9e67969eaff-dirty
rmarenco
parents:
diff
changeset
|
8 from util import subtools |
|
fb5e60d4d18a
planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 64cfc08088d11f6818c1b4e5514ef9e67969eaff-dirty
rmarenco
parents:
diff
changeset
|
9 |
class InfoModifiedGtf(object):
    """Report describing whether a GTF had to be altered, and where.

    Attributes:
        is_modified: True when at least one line of the GTF was dropped.
        array_modified_lines: 1-based line numbers of the dropped lines.
    """

    def __init__(self, is_modified=False, array_modified_lines=None):
        """Create a report.

        Args:
            is_modified: Initial value of the "was modified" flag.
            array_modified_lines: List used to collect the line numbers.
                When omitted, a fresh empty list is created for this
                instance (a literal ``[]`` default would be shared by every
                instance and would accumulate lines across reports).
        """
        self.is_modified = is_modified
        if array_modified_lines is None:
            array_modified_lines = []
        self.array_modified_lines = array_modified_lines

    def get_str_modified_lines(self):
        """Return the collected line numbers as a comma-separated string."""
        return ','.join(map(str, self.array_modified_lines))
|
1
fb5e60d4d18a
planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 64cfc08088d11f6818c1b4e5514ef9e67969eaff-dirty
rmarenco
parents:
diff
changeset
|
17 |
|
fb5e60d4d18a
planemo upload for repository https://github.com/goeckslab/hub-archive-creator commit 64cfc08088d11f6818c1b4e5514ef9e67969eaff-dirty
rmarenco
parents:
diff
changeset
|
class Gtf( Datatype ):
    """Datatype converting a GTF file into a bigGenePred (bigBed) track.

    The conversion chain run by the constructor is:
    GTF -> genePred -> bigGenePred -> sorted bigGenePred -> bigBed.

    NOTE(review): ``self.chromSizesFile``, ``self.tool_directory``,
    ``self.myTrackFolderPath`` and ``self.createTrack`` are not defined in
    this file; they are presumably provided by ``Datatype`` -- confirm there.
    """

    def __init__( self, input_gtf_false_path, data_gtf):
        """Validate the GTF, convert it to bigBed and register the track.

        Args:
            input_gtf_false_path: Path of the GTF file to convert.
            data_gtf: Mapping providing the keys "name", "order_index",
                "track_color" and "group_name".
        """
        super(Gtf, self).__init__()

        self.track = None

        self.input_gtf_false_path = input_gtf_false_path
        self.name_gtf = data_gtf["name"]
        self.priority = data_gtf["order_index"]
        self.track_color = data_gtf["track_color"]
        # TODO: Think about how to avoid repetition of the group_name everywhere
        self.group_name = data_gtf["group_name"]

        # GtfToGenePred
        ## Checking the integrity of the inputs
        # This also repoints self.input_gtf_false_path to the checked copy.
        modified_gtf = self._checkAndFixGtf()

        # bedToBigBed inputs / outputs
        trackName = "".join( ( self.name_gtf, ".bb") )
        auto_sql_option = os.path.join(self.tool_directory, 'bigGenePred.as')
        myBigBedFilePath = os.path.join(self.myTrackFolderPath, trackName)

        # TODO: See if we need these temporary files as part of the generated files
        # The intermediate files are only handed over by name to the external
        # tools, so no buffering option is needed on the Python handles. The
        # `with` block closes (and thereby deletes) them even if a tool fails.
        with tempfile.NamedTemporaryFile(suffix=".genePred") as genePredFile, \
                tempfile.NamedTemporaryFile(suffix=".unsorted.bigGenePred") as unsorted_bigGenePred_file, \
                tempfile.NamedTemporaryFile(suffix=".sortedBed.bigGenePred") as sorted_bigGenePred_file:

            ## Processing the gtf
            subtools.gtfToGenePred(self.input_gtf_false_path, genePredFile.name)

            # TODO: From there, refactor because common use with Gff3.py
            # genePredToBigGenePred processing
            subtools.genePredToBigGenePred(genePredFile.name, unsorted_bigGenePred_file.name)

            # Sort processing
            subtools.sort(unsorted_bigGenePred_file.name, sorted_bigGenePred_file.name)

            # bedToBigBed processing
            with open(myBigBedFilePath, 'w') as bigBedFile:
                subtools.bedToBigBed(sorted_bigGenePred_file.name,
                                     self.chromSizesFile.name,
                                     bigBedFile.name,
                                     autoSql=auto_sql_option,
                                     typeOption='bed12+8',
                                     tab=True)

        # Create the Track Object
        self.createTrack(file_path=trackName,
                         track_name=trackName,
                         long_label=self.name_gtf, track_type='bigGenePred',
                         visibility='dense', priority=self.priority,
                         track_file=myBigBedFilePath,
                         track_color=self.track_color,
                         group_name=self.group_name)

        # TODO: Use Logging instead of print
        if modified_gtf.is_modified:
            print("- Warning: Gtf %s created with a modified version of your Gtf because of start/end coordinates issues."
                  % self.name_gtf)
            print("Here are the lines removed: " + modified_gtf.get_str_modified_lines())
        else:
            print("- Gtf %s created" % self.name_gtf)

    @staticmethod
    def _coordinatesFitScaffold(fields, dict_chrom_sizes):
        """Tell whether a split GTF record lies inside its scaffold.

        Args:
            fields: Whitespace-split columns of one GTF record.
            dict_chrom_sizes: Mapping scaffold name -> size (int).

        Returns:
            True when 0 < start and end <= scaffold size; False when the
            record is out of range, has fewer than five columns, or has
            non-integer start/end columns.

        Raises:
            KeyError: If the scaffold is absent from ``dict_chrom_sizes``.
        """
        if len(fields) < 5:
            return False

        # fields[0] => Seqname (scaffold)
        # fields[3] => Start of the feature
        # fields[4] => End of the feature
        scaffold_size = dict_chrom_sizes[fields[0]]
        try:
            # Compare as integers: the raw columns are strings and would
            # otherwise be ordered lexicographically ("9" > "10").
            start_position = int(fields[3])
            end_position = int(fields[4])
        except ValueError:
            return False

        return start_position > 0 and end_position <= scaffold_size

    def _checkAndFixGtf(self):
        """Write a checked copy of the GTF and use it as the new input.

        Every record whose coordinates do not fit inside its scaffold
        (start <= 0 or end > scaffold size, per the chrom.sizes file) is
        left out of the copy; comment lines (starting with '#') and blank
        lines are copied unchanged. ``self.input_gtf_false_path`` is then
        replaced by the path of the copy.

        Truncating to the end of the scaffold instead of removing the line
        is not implemented yet (see the TODOs below).

        Returns:
            InfoModifiedGtf: ``is_modified`` is True when at least one line
            was left out, and ``array_modified_lines`` holds the 1-based
            numbers of those lines in the original file.

        Raises:
            KeyError: If a record names a scaffold missing from chrom.sizes.
            ValueError: If a size in chrom.sizes is not an integer.
        """
        # An explicit list is passed so this report never shares state with
        # another one through the constructor's default argument.
        modified_gtf = InfoModifiedGtf(is_modified=False, array_modified_lines=[])

        # TODO: Get the user choice and use it
        # Get the chrom.sizes into a dictionary to have a faster access
        # TODO: Think about doing this in Datatype.py, so everywhere we have access to this read-only dictionary
        dict_chrom_sizes = {}
        with open(self.chromSizesFile.name, 'r') as chromSizes:
            for line in chromSizes:
                fields = line.split()
                # fields[0] should be the name of the scaffold
                # fields[1] should be the size of the scaffold
                # TODO: Ensure this is true for all lines
                if len(fields) < 2:
                    # Blank or incomplete line: nothing to record
                    continue
                dict_chrom_sizes[fields[0]] = int(fields[1])

        # Create the checked copy. delete=False because the file must outlive
        # this handle: the conversion tools read it afterwards by name.
        temp_gtf = tempfile.NamedTemporaryFile(mode='w', suffix=".gtf", delete=False)

        # Parse the GTF and check each line using the chrom sizes dictionary
        with temp_gtf as tmp:
            with open(self.input_gtf_false_path, 'r') as gtf:
                for index, line in enumerate(gtf):
                    # Lines read from the file already carry their line
                    # ending; only a final unterminated line needs one.
                    if not line.endswith('\n'):
                        line += '\n'

                    # Comments and blank lines carry no coordinates to check
                    if line.startswith('#') or not line.strip():
                        tmp.write(line)
                        continue

                    if self._coordinatesFitScaffold(line.split(), dict_chrom_sizes):
                        # We are good, so we copy this line
                        tmp.write(line)
                    else:
                        # TODO: Process the user choice
                        # By default, we are assuming the user choice is to
                        # remove the lines: we don't copy it, and we save the
                        # line number for the feedback to the user
                        modified_gtf.array_modified_lines.append(index + 1)
                        modified_gtf.is_modified = True

        # Once the process is completed, we just replace the path of the gtf
        self.input_gtf_false_path = temp_gtf.name

        # TODO: Manage the issue with the fact the dataset is going to still exist on the disk because of delete=False

        return modified_gtf
