annotate data_stores/kipper.py @ 1:5c5027485f7d draft

Uploaded correct file
author damion
date Sun, 09 Aug 2015 16:07:50 -0400
parents
children 269d246ce6d0
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
1
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
1 #!/usr/bin/python
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
2 # -*- coding: utf-8 -*-
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
3
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
4 import subprocess
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
5 import datetime
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
6 import dateutil.parser as parser2
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
7 import calendar
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
8 import optparse
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
9 import re
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
10 import os
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
11 import sys
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
12 from shutil import copy
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
13 import tempfile
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
14 import json
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
15 import glob
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
16 import gzip
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
17
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
18
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
19
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
20
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
# Version string of this script; printed for the code-version option and
# stored under 'version' in newly created metadata.
CODE_VERSION = '1.0.0'
# Captures runs of decimal digits.
# NOTE(review): presumably consumed by natural_sort_key() defined elsewhere in this file - confirm.
REGEX_NATURAL_SORT = re.compile('([0-9]+)')
# Action codes handed to db_scan_action() and get_temp_output_file().
KEYDB_LIST = 1
KEYDB_EXTRACT = 2
KEYDB_REVERT = 3
KEYDB_IMPORT = 4
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
27
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
class MyParser(optparse.OptionParser):
    """
    Provides a better class for displaying formatted help info.
    From http://stackoverflow.com/questions/1857346/python-optparse-how-to-include-additional-info-in-usage-output.

    The epilog is emitted verbatim instead of being re-wrapped by the help
    formatter, so hand-formatted line breaks and indentation survive.
    """
    def format_epilog(self, formatter):
        """
        Return the epilog text unchanged.

        @param formatter optparse help formatter (ignored on purpose).
        @return string the epilog, or '' when no epilog was configured.
        """
        # format_help() joins this value with other strings, so it must never
        # be None even when the parser was built without an epilog.
        return self.epilog or ""
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
35
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def stop_err( msg ):
    """
    Write msg followed by a newline to stderr, then exit with status 1.

    @param msg object message to report; rendered with %s formatting.
    """
    # Wrap msg in a 1-tuple so that a tuple argument is printed as-is rather
    # than being unpacked as several % format arguments (which raised TypeError).
    sys.stderr.write("%s\n" % (msg,))
    sys.exit(1)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
39
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
40 class Kipper(object):
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
41
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
42
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def __init__(self):
    """
    Give every instance attribute a default value. __main__() later fills
    them in from the command line and the metadata file.
    """
    # Function-scope import: this module does not import time at top level.
    import time

    # Provide defaults
    self.db_master_file_name = None
    self.db_master_file_path = None
    self.metadata_file_path = None
    self.db_import_file_path = None
    self.output_file = None # By default, printed to stdout
    self.volume_id = None
    self.version_id = None # Note, this is natural #, starts from 1;
    self.metadata = None
    self.options = None
    self.compression = ''

    # Whole-second timestamp for "now". This used strftime("%s"), an
    # undocumented platform extension that is not available everywhere
    # (e.g. Windows); time.mktime() performs the same conversion portably.
    # As before, the naive UTC wall-clock time is interpreted as local time.
    # NOTE(review): the rest of the file presumably relies on that convention -
    # confirm before switching to a true UTC epoch.
    _nowabout = datetime.datetime.utcnow()
    self.dateTime = int(time.mktime(_nowabout.timetuple()))

    # Field delimiter and line terminator used for database rows.
    self.delim = "\t"
    self.nl = "\n"
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
61
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
62
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def __main__(self):
    """
    Handles all command line options for creating kipper archives, and extracting or reverting to a version.

    Steps, in order:
      1. Parse the command line; if the code-version option is set, print
         and return CODE_VERSION.
      2. Resolve the master database path (first positional argument), the
         '.md' metadata path next to it, and the output path.
      3. Optionally initialize the metadata file, then load it.
      4. Work out which volume/version is targeted (explicit option, or the
         latest version of the last volume).
      5. Run the requested action: add volume, import, dump metadata,
         extract or revert. With no action, list the versions.

    @return CODE_VERSION when the code-version option is given, otherwise None.
    """
    options, args = self.get_command_line()
    self.options = options

    if options.code_version:
        print CODE_VERSION
        return CODE_VERSION

    # *********************** Get Master kipper file ***********************
    if not len(args):
        stop_err('A Kipper database file name needs to be included as first parameter!')

    self.db_master_file_name = args[0] #accepts relative path with file name

    self.db_master_file_path = self.check_folder(self.db_master_file_name, "Kipper database file")
    # db_master_file_path is used from now on; db_master_file_name is used just for metadata labeling.
    # Adjust it to remove any relative path component.
    self.db_master_file_name = os.path.basename(self.db_master_file_name)

    if os.path.isdir(self.db_master_file_path):
        stop_err('Error: Kipper data file "%s" is actually a folder!' % (self.db_master_file_path) )

    # The metadata file always sits next to the master file, with '.md' appended.
    self.metadata_file_path = self.db_master_file_path + '.md'

    # Returns path but makes sure its folder is real. Must come before get_metadata()
    self.output_file = self.check_folder(options.db_output_file_path)


    # ************************* Get Metadata ******************************
    if options.initialize:
        # options.initialize carries the data store type passed on to set_metadata().
        if options.compression:
            self.compression = options.compression

        self.set_metadata(type=options.initialize, compression=self.compression)

    self.get_metadata(options);

    self.check_date_input(options)

    if options.version_id or (options.extract and options.version_index):
        if options.version_index:
            vol_ver = self.version_lookup(options.version_index)

        else:
            # Note version_id info overrides any date input above.
            vol_ver = self.get_version(options.version_id)

        if not vol_ver:
            stop_err("Error: Given version number or name does not exist in this database")

        # A successful lookup yields a (volume, version) pair of metadata dicts.
        (volume, version) = vol_ver
        self.volume_id = volume['id']
        self.version_id = version['id']
        self.dateTime = float(version['created'])
    else:
        # Use latest version by default
        if not self.version_id and len(self.metadata['volumes'][-1]['versions']) > 0:
            self.volume_id = self.metadata['volumes'][-1]['id']
            self.version_id = self.metadata['volumes'][-1]['versions'][-1]['id']

    # ************************** Action triggers **************************

    # Adding a volume does not return: it can be combined with the actions below.
    if options.volume == True:
        # Add a new volume to the metadata
        self.metadata_create_volume()
        self.write_metadata(self.metadata)

    if options.db_import_file_path != None:
        # Any time an import file is specified, this is the only action:
        self.try_import_file(options)
        return

    if options.metadata == True:
        # Writes metadata to disk or stdout
        self.write_metadata2(self.metadata)
        return

    if options.extract == True:
        # Defaults to pulling latest version
        if not (self.version_id):
            stop_err('Error: Please supply a version id (-n [number]) or date (-d [date]) to extract.')

        # When the output target is a folder, derive a file name to write inside it.
        if self.output_file and os.path.isdir(self.output_file):
            # A general output file name for the data store as a whole
            output_name = self.metadata['file_name']
            if output_name == '':
                # Get output file name from version's original import file_name
                output_name = self.metadata['volumes'][self.volume_id-1]['versions'][self.version_id-1]['file_name']
                # But remove the .gz suffix if it is there (refactor later).
                if output_name[-3:] == '.gz':
                    output_name = output_name[0:-3]
            self.output_file = os.path.join(self.output_file, output_name)

        self.db_scan_action(KEYDB_EXTRACT)
        return

    if options.revert == True:
        if not (options.version_id or options.dateTime or options.unixTime):
            stop_err('Error: Please supply a version id (-n [number]) or date (-d [date]) to revert to.')

        # Send database back to given revision
        # If the output path is the master file's own folder, write to the
        # current volume file path from get_db_path().
        if self.output_file and self.output_file == os.path.dirname(self.db_master_file_path):
            self.output_file = self.get_db_path()
        self.db_scan_action(KEYDB_REVERT)
        return

    # Default to list datastore versions
    self.get_list()
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
174
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
175
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def get_db_path(self, volume_id = None):
    """
    Build the path of one volume file: the master file path, an underscore,
    the volume id, then the compression suffix recorded in the metadata.

    Note: metadata must be established before this method is called.

    @param volume_id volume number to use; None selects self.volume_id.
    @return string path of the volume file.
    """
    chosen_volume = self.volume_id if volume_id is None else volume_id
    uncompressed_path = self.db_master_file_path + '_' + str(chosen_volume)
    return uncompressed_path + self.metadata['compression']
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
180
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
181
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def get_temp_output_file(self, action = None, path=None):
    """
    Create a temp file in the folder of the target path and return a write
    handle (which also carries the temp file's name).

    For revert and import actions on a '.gz' data store the plain handle is
    closed and the same temp name is reopened through myGzipFile.

    @param action one of the KEYDB_* action codes, or None.
    @param path string target path; None selects self.output_file.
    @return open temp file handle (not deleted on close).
    """
    target = self.output_file if path is None else path
    handle = tempfile.NamedTemporaryFile(mode='w+t', delete=False, dir=os.path.dirname(target))

    # Only actions that write volume files need the gzip handler.
    needs_gzip = action in [KEYDB_REVERT, KEYDB_IMPORT] and self.metadata['compression'] == '.gz'
    if not needs_gzip:
        return handle

    handle.close()
    return myGzipFile(handle.name, 'wb')
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
195
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
196
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def get_list(self):
    """
    Print every volume and the versions it holds to stdout.

    Each volume gets a header line "Volume N, Versions <floor>-<ceiling>",
    followed by one "<id>: <ISO date>_v<name>" line per version. The ceiling
    is left empty for the last volume.
    """
    volumes = self.metadata['volumes']
    last_index = len(volumes) - 1
    for index, volume in enumerate(volumes):
        # A volume's range ends just below the next volume's floor_id;
        # the last volume is still open-ended.
        if index < last_index:
            ceiling = str(volumes[index + 1]['floor_id'] - 1)
        else:
            ceiling = ''
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("Volume " + str(index + 1) + ", Versions " + str(volume['floor_id']) + "-" + ceiling)

        for version in volume['versions']:
            print(str(version['id']) + ": " + self.dateISOFormat(float(version['created'])) + '_v' + version['name'])
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
209
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
210
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def set_metadata(self, type='text', compression=''):
    """
    Request to initialize metadata file

    Stops with an error if the metadata file already exists. Otherwise a
    fresh metadata structure is created and every existing volume file
    (master file path + '_' + a name starting with a digit) is scanned, in
    natural sort order, to rebuild per-version counts. The result is written
    as JSON to self.metadata_file_path.

    Algorithm processes each line as it comes in database. This means there
    is no significance to the version_ids ordering; earlier items in list can
    in fact be later versions of db. So must resort and re-assign ids in end.

    @param type string text or fasta etc.
    @param compression string compression suffix passed on to metadata_create().
    @return True once the metadata file has been written.
    """
    if os.path.isfile(self.metadata_file_path):
        stop_err('Error: Metadata file "%s" exists. You must remove it before generating a new one.' % (self.metadata_file_path) )

    self.metadata_create(type, compression)

    volumes = glob.glob(self.db_master_file_path + '_[0-9]*')

    volumes.sort(key=lambda x: natural_sort_key(x))
    for volume in volumes:
        # Note: scanned volumes must be consecutive from 1. No error detection yet.
        self.metadata_create_volume(False)
        versions = self.metadata['volumes'][-1]['versions']
        # The volume file's modification time is handed to version_dict_lookup().
        import_modified = os.path.getmtime(volume)
        dbReader = bigFileReader(volume)
        version_ids = []
        db_key_value = dbReader.read()
        old_key = ''
        while db_key_value:

            # Row layout: created version id, deleted version id (may be empty), key, rest of line.
            (created_vid, deleted_vid, db_key, restofline) = db_key_value.split(self.delim, 3)
            version = versions[self.version_dict_lookup(version_ids, long(created_vid), import_modified)]
            version['rows'] +=1
            # Count a key only when it differs from the previous row's key.
            if old_key != db_key:
                version['keys'] +=1
                old_key = db_key

            version['inserts'] += 1
            if deleted_vid:
                # The delete is credited to the version that removed the row.
                version = versions[self.version_dict_lookup(version_ids, long(deleted_vid), import_modified)]
                version['deletes'] += 1

            db_key_value = dbReader.read()

        # Reorder, and reassign numeric version ids:
        versions.sort(key=lambda x: x['id'])
        for ptr, version in enumerate(versions):
            version['id'] = ptr+1

    # If first master db volume doesn't exist, then this is an initialization situation
    if len(volumes) == 0:
        self.metadata_create_volume()
        self.create_volume_file()

    with open(self.metadata_file_path,'w') as metadata_handle:
        metadata_handle.write(json.dumps(self.metadata, sort_keys=True, indent=4, separators=(',', ': ')))

    return True
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
271
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
272
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def get_metadata(self, options):
    """
    Load the JSON metadata file into self.metadata and choose the processor
    object matching its 'type' (fasta, otherwise plain text).

    Stops with an error when the metadata file does not exist.

    @param options parsed command line options (not read by this method).
    """
    metadata_path = self.metadata_file_path
    if not os.path.isfile(metadata_path):
        stop_err('Error: Unable to locate the "%s" metadata file. It should accompany the "%s" file. Use the -M parameter to initialize or regenerate the basic file.' % (metadata_path, self.db_master_file_name) )

    with open(metadata_path,'r') as metadata_handle:
        self.metadata = json.load(metadata_handle)

    # ******************* Select Kipper Pre/Post Processor **********************
    # FUTURE: More processor options here - including custom ones referenced in metadata
    # fasta sequence databases get their own processor; everything else is default text.
    processor_class = VDBFastaProcessor if self.metadata['type'] == 'fasta' else VDBProcessor
    self.processor = processor_class()

    # Handle any JSON metadata defaults here for items that aren't present in previous databases.
    self.metadata.setdefault('compression', '')
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
295
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
296
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def write_metadata(self, content):
    """
    Save metadata after a data store change (revert and import), but only
    when an output file is set.

    With no output file the results are going to stdout, so the metadata is
    not streamed there as well.

    @param content metadata structure handed on to write_metadata2().
    """
    if not self.output_file:
        return
    self.write_metadata2(content)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
303
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
304
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def write_metadata2(self,content):
    """
    Serialize content as key-sorted, 4-space indented JSON.

    The text goes to self.metadata_file_path when self.output_file is set,
    otherwise to sys.stdout.

    @param content JSON-serializable metadata structure.
    """
    # Serialize first so a failure cannot leave a truncated metadata file behind.
    text = json.dumps(content, sort_keys=True, indent=4, separators=(',', ': '))

    if self.output_file:
        with open(self.metadata_file_path,'w') as metadata_handle:
            metadata_handle.write(text)
    else:
        # sys.stdout must not be used as a context manager: leaving the
        # "with" block would close it and break every later write to stdout.
        sys.stdout.write(text)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
309
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
310
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def metadata_create(self, type, compression, floor_id=1):
    """
    Initial metadata structure

    Replaces self.metadata with a new dictionary that has no volumes yet.

    @param type string text or fasta etc.; stored as 'type' and used as the
        extension of the suggested output 'file_name'.
    @param compression string compression suffix stored under 'compression'
        ('' for none, '.gz' for gzip).
    @param floor_id int not used by this method; kept so existing calls keep working.
    """
    # Master file name without its last extension (whole name if it has none).
    base_name = self.db_master_file_name.rsplit('.',1)[0]
    self.metadata = {
        'version': CODE_VERSION,
        'name': self.db_master_file_name,
        'db_file_name': self.db_master_file_name,
        # A guess about what best base file name would be to write versions out as
        'file_name': base_name + '.' + type,
        'type': type,
        'description': '',
        'processor': '', # Processing that overrides type-matched processor.
        # Honour the argument; it used to be ignored in favour of self.compression.
        'compression': compression,
        'volumes': []
    }
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
328
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
329
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def metadata_create_volume(self, file_create = True):
    """
    Append a new, empty volume record to self.metadata['volumes'].

    The new volume's id is its 1-based position and its floor_id is one past
    the last version id. self.volume_id is pointed at it and, when
    file_create is true, its backing file is created.

    Returns the new volume id. If the latest volume has no versions yet, no
    volume is added and stop_err() is called instead.
    """
    volumes = self.metadata['volumes']

    # Only add a volume if previous volume has at least 1 version in it.
    if volumes and not volumes[-1]['versions']:
        stop_err("Error: Didn't create a new volume because last one is empty already.")
        return None

    new_id = len(volumes) + 1
    volumes.append({
        'floor_id': self.get_last_version() + 1,
        'id': new_id,
        'versions': []
    })
    self.volume_id = new_id

    if file_create:
        self.create_volume_file()

    return new_id
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
349
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
350
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def create_volume_file(self):
    """
    Create (or truncate) the file at self.get_db_path() with no content.

    The file is opened through gzip when the metadata 'compression' entry is
    '.gz', otherwise as a plain file.
    """
    if self.metadata['compression'] == '.gz':
        handle = gzip.open(self.get_db_path(), 'wb')
    else:
        handle = open(self.get_db_path(), 'w')
    handle.close()
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
357
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
358
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def metadata_create_version(self, mydate, file_name = '', file_size = 0, version_name = None):
    """
    Append a new version record to the last volume and return it.

    The record's id is one past self.get_last_version(); its name defaults
    to that id as a string. The inserts/deletes/rows/keys counters start at
    zero.
    """
    next_id = self.get_last_version() + 1
    label = str(next_id) if version_name is None else version_name

    record = dict.fromkeys(('inserts', 'deletes', 'rows', 'keys'), 0)
    record.update(
        id=next_id,
        created=mydate,
        name=label,
        file_name=file_name,
        file_size=file_size
    )

    latest_volume = self.metadata['volumes'][-1]
    latest_volume['versions'].append(record)
    return record
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
378
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
379
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def get_version(self, version_id = None):
    """
    Find a version record by id, defaulting to self.version_id.

    Returns the first matching (volume, version) pair, or False when no
    version has that id.
    """
    wanted = self.version_id if version_id is None else version_id

    matches = (
        (volume, version)
        for volume in self.metadata['volumes']
        for version in volume['versions']
        if wanted == version['id']
    )
    return next(matches, False)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
390
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
391
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def version_lookup(self, version_name):
    """
    Find a version record by its 'name' entry.

    Returns the first matching (volume, version) pair, or False when no
    version carries that name.
    """
    candidates = (
        (volume, version)
        for volume in self.metadata['volumes']
        for version in volume['versions']
    )
    for pair in candidates:
        if version_name == pair[1]['name']:
            return pair
    return False
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
399
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
400
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def version_dict_lookup(self, version_ids, id, timestamp = None):
    """
    Return the position of id within the version_ids list.

    An id not seen before is appended to version_ids and a new version
    record is created for it with the given timestamp.
    """
    try:
        position = version_ids.index(id)
    except ValueError:
        # First sighting of this id: register it and create its version.
        position = len(version_ids)
        version_ids.append(id)
        self.metadata_create_version(timestamp)
    return position
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
407
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
408
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
409 #****************** Methods Involving Scan of Master Kipper file **********************
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
410
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def db_scan_action (self, action):
    """
    Scan every record of the file at self.get_db_path() and write the lines
    kept for the given action to self.output_file (through a temporary
    file), or to stdout when no output file is set.

    action selects the per-record handler: KEYDB_EXTRACT uses
    version_extract, KEYDB_REVERT uses version_revert. Any other value
    raises ValueError before any file is touched.

    With an output file, after the scan: an extract is passed through
    self.processor.postprocess_file, the temporary file is renamed over
    self.output_file, a revert prunes versions and volumes later than
    self.version_id from self.metadata, and the metadata is written.
    """
    # The handler does not change during the scan, so pick it once.
    if action == KEYDB_EXTRACT:
        scan_record = self.version_extract
    elif action == KEYDB_REVERT:
        scan_record = self.version_revert
    else:
        raise ValueError("Unsupported scan action: " + str(action))

    dbReader = bigFileReader(self.get_db_path())

    # Use temporary file so that output file switches to new content only when complete
    if self.output_file:
        temp_file = self.get_temp_output_file(action=action)
        output = temp_file
    else:
        temp_file = None
        output = sys.stdout

    try:
        db_key_value = dbReader.read()
        while db_key_value:
            okLines = scan_record(db_key_value)
            if okLines:
                output.writelines(okLines)
            db_key_value = dbReader.read()
    finally:
        if temp_file is not None:
            # Must be closed before it is post-processed and renamed below.
            temp_file.close()
        else:
            # Never close sys.stdout (a "with" block would); just flush it.
            output.flush()

    # Issue: metadata lock while quick update with output_file???
    if self.output_file:
        if action == KEYDB_EXTRACT:
            self.processor.postprocess_file(temp_file.name)

        # Is there a case where we fail to get to this point?
        os.rename(temp_file.name, self.output_file)

        if action == KEYDB_REVERT:
            # When reverting, clear all volumes having versions > self.version_id
            # Takes out volume structure too.
            volumes = self.metadata['volumes']
            # Walk backwards so popping a volume does not shift pending indexes.
            for volptr in range(len(volumes)-1, -1, -1):
                volume = volumes[volptr]
                if volume['floor_id'] > self.version_id: #TO REVERT IS TO KILL ALL LATER VOLUMES.
                    os.remove(self.get_db_path(volume['id']))

                # Filter in place so the metadata keeps the same list object.
                versions = volume['versions']
                versions[:] = [kept for kept in versions if kept['id'] <= self.version_id]

                # The first volume is kept even when it ends up empty.
                if not versions and volptr > 0:
                    volumes.pop(volptr)

        self.write_metadata(self.metadata)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
461
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
462
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def db_scan_line(self, db_key_value):
    """
    Split one database record into (created_vid, deleted_vid, restofline).

    The record is split on self.delim at most twice. created_vid is returned
    as an integer; deleted_vid is an integer when present, otherwise the
    empty string found in the record; restofline is returned unchanged.

    FUTURE: transact_code will signal how key/value should be interpreted, to
    allow for differential change storage from previous entries.
    """
    created_vid, deleted_vid, restofline = db_key_value.split(self.delim, 2)

    # int() grows to arbitrary size on Python 2 and also exists on
    # Python 3, where long() was removed.
    if deleted_vid:
        deleted_vid = int(deleted_vid)
    return (int(created_vid), deleted_vid, restofline)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
472
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
473
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def version_extract(self, db_key_value):
    """
    Decide whether a database record belongs to version self.version_id.

    A record belongs when it was created at or before that version and is
    either not deleted or deleted only after it. For such a record the
    result of self.processor.postprocess_line on its payload is returned;
    otherwise False.
    """
    created_vid, deleted_vid, restofline = self.db_scan_line(db_key_value)

    # Created after the requested version.
    if created_vid > self.version_id:
        return False

    # Already deleted by the requested version.
    if deleted_vid and deleted_vid <= self.version_id:
        return False

    return self.processor.postprocess_line(restofline)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
481
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
482
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def version_revert(self, db_key_value):
    """
    Rewrite one database record as it stands at version self.version_id.

    Records created after that version yield False. Otherwise a one-item
    list holding the record line is returned; a deletion made after that
    version is blanked out, an earlier one (or none) is kept as is.
    """
    created_vid, deleted_vid, restofline = self.db_scan_line(db_key_value)

    if created_vid > self.version_id:
        return False

    if deleted_vid and deleted_vid > self.version_id:
        # Deleted later than the target version: undo the deletion.
        deleted_field = ''
    else:
        deleted_field = str(deleted_vid)

    return [self.delim.join((str(created_vid), deleted_field, restofline))]
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
495
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
496
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def check_date_input(self, options):
    """
    Turn a user supplied time into self.dateTime and select the matching
    version.

    options.unixTime is tried first, then options.dateTime (read with
    parse_date); a value that cannot be parsed is reported through
    stop_err(). self.dateTime is set to whole seconds, then
    self.version_id / self.volume_id are set from versions whose 'created'
    value is not later than it.

    Returns False when neither option was given, True otherwise.
    """
    import time  # function-scope import: only needed for mktime below

    if options.unixTime is not None:
        try:
            # if it is not a float, triggers exception
            _userTime = float(options.unixTime)
        except ValueError:
            stop_err("Given Unix time could not be parsed [" + options.unixTime + "]. Format should be [integer]")

    elif options.dateTime is not None:
        try:
            _userTime = parse_date(options.dateTime)
        except ValueError:
            stop_err("Given date could not be parsed [" + options.dateTime + "]. Format should include at least the year, and any of the other more granular parts, in order: YYYY/MM/DD [H:M:S AM/PM]")

    else:
        return False

    # NOTE(review): assumes stop_err() does not return; otherwise _userTime
    # is unbound here - confirm.
    _dtobject = datetime.datetime.fromtimestamp(float(_userTime))
    # Whole seconds since the epoch for that local time. mktime() replaces
    # strftime("%s"), which is not a documented directive and is missing on
    # some platforms; int() replaces the Python 2 only long().
    self.dateTime = int(time.mktime(_dtobject.timetuple()))

    # Now see if we can set version_id by it. We look for version_id that has created <= self.dateTime
    for volume in self.metadata['volumes']:
        for version in volume['versions']:
            if version['created'] <= self.dateTime:
                self.version_id = version['id']
                self.volume_id = volume['id']
            else:
                # Stops the scan of this volume only; later volumes are still examined.
                break

    return True
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
532
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
533
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def check_folder(self, file_path, message = "Output directory for "):
    """
    Ensures file folder path for output file exists.
    We don't want to create output in a mistaken location.

    Returns the normalized path, resolved against the current working
    directory when its folder is not found as given, or None when file_path
    is None. A folder missing in both places is reported through stop_err().
    """
    if file_path is None:
        return None

    path = os.path.normpath(file_path)
    if os.path.isdir(os.path.dirname(path)):
        return path

    # Folder not found as given, so try default folder where script launched from:
    path = os.path.normpath(os.path.join(os.getcwd(), path))
    if not os.path.isdir(os.path.dirname(path)):
        stop_err(message + "[" + path + "] does not exist!")

    return path
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
550
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
551
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def check_file_path(self, file, message = "File "):
    """
    Return the normalized path of an existing file.

    When the file is not found as given, the path is resolved against the
    current working directory; if it is still not a file, stop_err() is
    called with message and the path tried.
    """
    path = os.path.normpath(file)
    found_as_given = os.path.isdir(os.path.dirname(path)) and os.path.isfile(path)

    if not found_as_given:
        # Not found as given, so try default folder where script was called:
        path = os.path.normpath(os.path.join(os.getcwd(), path))
        if not os.path.isfile(path):
            stop_err(message + "[" + path + "] doesn't exist!")

    return path
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
562
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
563
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def try_import_file(self, options):
    """
    Create new version from comparison of import data file against Kipper
    Note "-o ." parameter enables writing back to master database.

    options.db_import_file_path is checked with check_file_path() and
    validated by the processor (stop_err() on failure). The processor then
    converts it to a temporary file, which import_file() reads through
    self.db_import_file_path and which is removed afterwards, even when the
    import fails. options.version_index is passed on as the version name.
    """
    self.db_import_file_path = self.check_file_path(options.db_import_file_path, "Import data file ")

    if not self.processor.preprocess_validate_file(self.db_import_file_path):
        stop_err("Import data file isn't sorted or composed correctly!")

    # SET version date to the modification time of the import file.
    import_modified = os.path.getmtime(self.db_import_file_path)
    original_name = os.path.basename(self.db_import_file_path)

    # creates a temporary file, which has conversion into 1 line key-value records
    temp = self.processor.preprocess_file(self.db_import_file_path)
    if temp:
        self.db_import_file_path = temp.name
        try:
            self.import_file(original_name, import_modified, options.version_index)
        finally:
            # Do not leave the converted copy behind if the import fails.
            os.remove(temp.name)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
588
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
589
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
590 def import_file(self, file_name, import_modified, version_index = None):
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
591 """
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
592 Imports from an import file (or temp file if transformation done above) to
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
593 temp Kipper version which is copied over to main database on completion.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
594
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
595 Import algorithm only works if the import file is already sorted in the same way as the Kipper database file
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
596
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
597 @uses self.db_import_file_path string A file full of one line key[tab]value records.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
598 @uses self.output_file string A file to save results in. If empty, then stdio.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
599
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
600 @uses dateTime string Date time to mark created/deleted records by.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
601 @puses delim char Separator between key/value pairs.ake it the function.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
602
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
603 @param file_name name of file being imported. This is stored in version record so that output file will be the same.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
604 """
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
605 delim = self.delim
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
606
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
607
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
608 file_size = os.path.getsize(self.db_import_file_path)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
609 if version_index == None:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
610 version_index = str(self.get_last_version()+1)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
611
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
612 self.volume_id = self.metadata['volumes'][-1]['id'] #For get_db_path() call below.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
613
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
614 if self.output_file:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
615 temp_file = self.get_temp_output_file(action=KEYDB_IMPORT, path=self.get_db_path())
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
616
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
617 # We want to update database here when output file is db itself.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
618 if os.path.isdir(self.output_file):
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
619 self.output_file = self.get_db_path()
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
620
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
621 version = self.metadata_create_version(import_modified, file_name, file_size, version_index)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
622 version_id = str(version['id'])
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
623
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
624 with (temp_file if self.output_file else sys.stdout) as outputFile :
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
625 dbReader = bigFileReader(self.get_db_path())
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
626 importReader = bigFileReader(self.db_import_file_path)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
627 old_import_key=''
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
628
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
629 while True:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
630
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
631 db_key_value = dbReader.turn()
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
632 #if import_key_value
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
633 import_key_value = importReader.turn()
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
634
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
635 # Skip empty or whitespace lines:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
636 if import_key_value and len(import_key_value.lstrip()) == 0:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
637 import_key_value = importReader.read()
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
638 continue
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
639
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
640 if not db_key_value: # eof
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
641 while import_key_value: # Insert remaining import lines:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
642 (import_key, import_value) = self.get_key_value(import_key_value)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
643 outputFile.write(version_id + delim + delim + import_key + delim + import_value)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
644 import_key_value = importReader.read()
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
645 version['inserts'] += 1
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
646 version['rows'] += 1
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
647
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
648 if import_key != old_import_key:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
649 version['keys'] += 1
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
650 old_import_key = import_key
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
651
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
652 break # Both inputs are eof, so exit
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
653
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
654 elif not import_key_value: # db has key that import file no longer has, so mark each subsequent db line as a delete of the key (if it isn't already)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
655 while db_key_value:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
656 (created_vid, deleted_vid, dbKey, dbValue) = db_key_value.split(delim,3)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
657 version['rows'] += 1
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
658
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
659 if deleted_vid:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
660 outputFile.write(db_key_value)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
661 else:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
662 outputFile.write(created_vid + delim + version_id + delim + dbKey + delim + dbValue)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
663 version['deletes'] += 1
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
664
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
665 db_key_value = dbReader.read()
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
666 break
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
667
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
668 else:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
669 (import_key, import_value) = self.get_key_value(import_key_value)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
670 (created_vid, deleted_vid, dbKey, dbValue) = db_key_value.split(delim,3)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
671
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
672 if import_key != old_import_key:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
673 version['keys'] += 1
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
674 old_import_key = import_key
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
675
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
676 # All cases below lead to writing a row ...
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
677 version['rows'] += 1
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
678
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
679 if import_key == dbKey:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
680 # When the keys match, we have enough information to act on the current db_key_value content;
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
681 # therefore ensure on next pass that we read it.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
682 dbReader.step()
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
683
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
684 if import_value == dbValue:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
685 outputFile.write(db_key_value)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
686
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
687 # All past items marked with insert will also have a delete. Step until we find one
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
688 # not marked as a delete... or a new key.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
689 if deleted_vid: # Good to go in processing next lines in both files.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
690 pass
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
691 else:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
692 importReader.step()
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
693
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
694 else: # Case where value changed - so process all db_key_values until key no longer matches.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
695
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
696 # Some future pass will cause import line to be written to db
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
697 # (when key mismatch occurs) as long as we dont advance it (prematurely).
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
698 if deleted_vid:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
699 #preserve deletion record.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
700 outputFile.write(db_key_value)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
701
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
702 else:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
703 # Mark record deletion
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
704 outputFile.write(created_vid + delim + version_id + delim + dbKey + delim + dbValue)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
705 version['deletes'] += 1
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
706 # Then advance since new key/value means new create
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
707
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
708 else:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
709 # Natural sort doesn't do text sort on numeric parts, ignores capitalization.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
710 dbKeySort = natural_sort_key(dbKey)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
711 import_keySort = natural_sort_key(import_key)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
712 # False if dbKey less; Means db key is no longer in sync db,
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
713 if cmp(dbKeySort, import_keySort) == -1:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
714
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
715 if deleted_vid: #Already marked as a delete
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
716 outputFile.write(db_key_value)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
717
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
718 else: # Write dbKey as a new delete
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
719 outputFile.write(created_vid + delim + version_id + delim + dbKey + delim + dbValue)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
720 version['deletes'] += 1
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
721 # Advance ... there could be another db_key_value for deletion too.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
722 dbReader.step()
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
723
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
724 else: #DB key is greater, so insert import_key,import_value in db.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
725 # Write a create record
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
726 outputFile.write(version_id + delim + delim + import_key + delim + import_value)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
727 version['inserts'] += 1
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
728 importReader.step() # Now compare next two candidates.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
729
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
730 if self.output_file:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
731 # Kipper won't write an empty version - since this is usually a mistake.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
732 # If user has just added new volume though, then slew of inserts will occur
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
733 # even if version is identical to tail end of previous volume version.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
734 if version['inserts'] > 0 or version['deletes'] > 0:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
735 #print "Temp file:" + temp_file.name
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
736 os.rename(temp_file.name, self.output_file)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
737 self.write_metadata(self.metadata)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
738 else:
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
739 os.remove(temp_file.name)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
740
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
741
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def get_last_version(self):
    """
    Return the id of the most recent version recorded in the metadata.

    Volumes are scanned from the last one in self.metadata['volumes'] back
    to the first, because a volume may hold no versions at all; in that
    case the previous volume is consulted instead.

    @return id of the last version in the last non-empty volume, or 0 when
            no volume contains a version.
    """
    for volume in reversed(self.metadata['volumes']):
        versions = volume['versions']
        if versions:
            return versions[-1]['id']

    return 0
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
753
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
754
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
755 # May want to move this to individual data store processor since it can be sensitive to different kinds of whitespace then.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def get_key_value(self, key_value):
    """
    Split one "[key][whitespace][value]" line into its key and value.

    The split happens at the first run of whitespace (any kind) following
    the key; leading whitespace before the key is ignored. Everything after
    that first whitespace run is returned untouched as the value, including
    any trailing newline.

    @param key_value string holding one key/value line
    @return tuple (key, value); value is '' when the line holds only a key
    @raise IndexError when key_value is empty or whitespace only, so callers
           must filter such lines out beforehand.
    """
    # ACCEPTS SPLIT AT ANY WHITESPACE PAST KEY BY DEFAULT
    parts = key_value.split(None, 1)
    key = parts[0]
    value = parts[1] if len(parts) > 1 else ''
    return (key, value)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
761
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
762
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def dateISOFormat(self, atimestamp):
    """
    Convert a Unix timestamp into an ISO 8601 date/time string.

    The conversion goes through datetime.datetime.fromtimestamp(), so the
    result is expressed in the local timezone and carries no UTC offset.

    @param atimestamp number of seconds since the epoch (int or float)
    @return string such as '2015-08-09T16:07:50'
    """
    return datetime.datetime.fromtimestamp(atimestamp).isoformat()
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
765
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
766
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def get_command_line(self):
    """
    *************************** Parse Command Line *****************************

    Declares every kipper.py command line option on a MyParser instance and
    parses the process arguments with it.

    @return whatever MyParser.parse_args() returns - presumably the optparse
            (options, args) pair; confirm against the MyParser definition.
    """
    # NOTE(review): leading whitespace inside the epilog text below was
    # reconstructed; confirm it against the displayed help if layout matters.
    parser = MyParser(
        description = 'Maintains versions of a file-based database with comparison to full-copy import file updates.',
        usage = 'kipper.py [kipper database file] [options]*',
        epilog="""

All outputs go to stdout and affect no change in Kipper database unless the '-o' parameter is supplied. (The one exception to this is when the -M regenerate metadata command is provided, as described below.) Thus by default one sees what would happen if an action were taken, but must take an additional step to affect the data.

'-o .' is a special request that leads to:
  * an update of the Kipper database for --import or --revert actions
  * an update of the .md file for -M --rebuild action

As well, when -o parameter is a path, and not a specific filename, then kipper.py looks up what the appropriate output file name is according to the metadata file.

USAGE

Initialize metadata file and Kipper file.
    kipper.py [database file] -M --rebuild [type of database:text|fasta]

View metadata (json) file.
    kipper.py [database file] -m --metadata

Import key/value inserts/deletes based on import file. (Current date used).
    kipper.py [database file] -i --import [import file]
    e.g.
    kipper.py cpn60 -i sequences.fasta # outputs new master database to stdout; doesn't rewrite it.
    kipper.py cpn60 -i sequences.fasta -o . # rewrites cpn60 with new version added.

Extract a version of the file based on given date/time
    kipper.py [database file] -e --extract -d datetime -o [output file]

Extract a version of the file based on given version Id
    kipper.py [database file] -e --extract -n [version id] -o [output file]

List versions of dbFile key/value pairs (by date/time)
    kipper.py [database file]
    kipper.py [database file] -l --list

Have database revert to previous version. Drops future records, unmarks corresponding deletes.
    kipper.py [database file] -r --revert -d datetime -o [output file]

Return version of this code:
    kipper.py -v --version
""")

    # Data/Metadata changing actions
    parser.add_option('-M', '--rebuild', type='choice', dest='initialize', choices=['text','fasta'],
        help='(Re)generate metadata file [name of db].md . Provide the type of db [text|fasta| etc.].')

    parser.add_option('-i', '--import', type='string', dest='db_import_file_path',
        help='Import key/value inserts/deletes based on delta comparison with import file')

    parser.add_option('-e', '--extract', dest='extract', default=False, action='store_true',
        help='Extract a version of the file based on given date/time')

    parser.add_option('-r', '--revert', dest='revert', default=False, action='store_true',
        help='Have database revert to previous version (-d date/time required). Drops future records, unmarks corresponding deletes.')

    parser.add_option('-V', '--volume', dest='volume', default=False, action='store_true',
        help='Add a new volume to the metadata. New imports will be added here.')

    # Passive actions
    parser.add_option('-m', '--metadata', dest='metadata', default=False, action='store_true',
        help='View metadata file [name of db].md')

    parser.add_option('-l', '--list', dest='list', default=False, action='store_true',
        help='List versions of dbFile key/value pairs (by date/time)')

    parser.add_option('-c', '--compression', dest='compression', type='choice', choices=['.gz'],
        help='Enable compression of database. options:[.gz]')

    # Used "v" for standard code version identifier.
    parser.add_option('-v', '--version', dest='code_version', default=False, action='store_true',
        help='Return version of kipper.py code.')

    # Help text says "stdout" to agree with the epilog ("All outputs go to stdout").
    parser.add_option('-o', '--output', type='string', dest='db_output_file_path',
        help='Output to this file. Default is to stdout')

    parser.add_option('-I', '--index', type='string', dest='version_index',
        help='Provide title (index) e.g. "1.4" of version being imported/extracted.')

    # Three alternative ways of selecting the version to act on.
    parser.add_option('-d', '--date', type='string', dest='dateTime',
        help='Provide date/time for sync, extract or revert operations. Defaults to now.')
    parser.add_option('-u', '--unixTime', type='int', dest='unixTime',
        help='Provide Unix time (integer) for sync, extract or revert operations.')
    parser.add_option('-n', '--number', type='int', dest='version_id',
        help='Provide a version id to extract or revert to.')

    return parser.parse_args()
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
860
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
861
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
class VDBProcessor(object):
    """
    Default data store processor: handles import files that already hold one
    tab-delimited "[key][TAB][value]" record per line.

    Subclasses override the pre/post-processing hooks to support other file
    formats.
    """

    delim = '\t'  # Field separator within a record
    nl = '\n'     # Record terminator

    def preprocess_file(self, file_path):
        """
        Make a sorted working copy of the import file.

        The copy is created in the same directory as file_path and is NOT
        deleted automatically; the caller owns it.

        @param file_path string path of the import file
        @return the (already closed) NamedTemporaryFile object; its .name
                attribute gives the path of the sorted copy.
        @raise subprocess.CalledProcessError when the external sort fails, so
               that unsorted data is never handed on as if it were sorted.
        """
        temp = tempfile.NamedTemporaryFile(mode='w+t', delete=False, dir=os.path.dirname(file_path))
        # Only the reserved file name is needed, so release the handle before
        # copy() and sort write to that same path.
        temp.close()
        copy(file_path, temp.name)
        # -s stable; -f ignore case; -V natural sort (versioning); -k column; -t tab delimiter
        subprocess.check_call(['sort', '-sfV', '-t\t', '-k1,1', '-o', temp.name, temp.name])
        return temp  # Enables temp file name to be used by caller.

    def preprocess_validate_file(self, file_path):
        """
        Placeholder for import file validation; currently accepts every file.

        Ideas noted for a future implementation:
          - verify that a downloaded file is complete (md5 hash?)
          - inspect file.newlines to detect the newline convention in use
          - auto-uncompress .tar.gz, bz2 etc.
          - ensure "[key] [value]" entries are sorted; "sort --check ..."
            prints nothing if sorted, or e.g.
            "sort: sequences_A.fastx.sorted:12: disorder: >114 AJ009959.1 ..."

        @param file_path string path of the import file (unused for now)
        @return True
        """
        return True

    def postprocess_line(self, line):
        """
        Convert one stored record back into output file line(s).

        @param line string holding one record
        @return list of output lines; a list is used so that one record can
                map to many lines in the output file. Here it is just [line].
        """
        return [line]

    def postprocess_file(self, file_path):
        """
        Hook for whole-file post-processing; this processor does none.

        @param file_path string path of the output file (unused)
        @return False
        """
        return False

    def sort(self, a, b):
        """Unimplemented comparison hook; does nothing and returns None."""
        pass
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
901
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
902
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
903 class VDBFastaProcessor(VDBProcessor):
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
904
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
905
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def preprocess_file(self, file_path):
    """
    Converts input fasta data into one line tab-delimited record format, then sorts.

    Each fasta entry becomes "[accession id][TAB][description][TAB][sequence]":
    the leading '>' is dropped from the id, tabs inside the description are
    replaced by spaces, and all sequence lines of the entry are concatenated.
    Blank lines are ignored. The result is written to a temp file created in
    the same directory as file_path, which is NOT deleted automatically.

    @param file_path string path of the fasta import file
    @return the (already closed) NamedTemporaryFile object; its .name
            attribute gives the path of the sorted one-line-per-record file.
    @raise subprocess.CalledProcessError when the external sort fails, so
           that unsorted data is never handed on as if it were sorted.
    """
    temp = tempfile.NamedTemporaryFile(mode='w+t', delete=False, dir=os.path.dirname(file_path))
    try:
        fileReader = bigFileReader(file_path)
        # Pieces of the record being assembled; joined once per record rather
        # than growing a string with every sequence line.
        record = []
        line = fileReader.read()
        while line:
            line = line.strip()
            if line:

                if line[0] == '>':
                    # A header starts a new record, so flush the previous one.
                    if record:
                        temp.write(''.join(record) + self.nl)
                    lineparse = line.split(None, 1)
                    key = lineparse[0].strip()
                    if len(lineparse) > 1:
                        # Tabs are field separators in the one-line format.
                        description = lineparse[1].strip().replace(self.delim, ' ')
                    else:
                        description = ''
                    record = [key[1:] + self.delim + description + self.delim]

                else:
                    record.append(line)

            line = fileReader.read()

        if record:
            temp.write(''.join(record) + self.nl)
    finally:
        # Always release the handle, even if reading or writing failed.
        temp.close()

    # Is this a consideration for natural sort in Python vs bash sort?:
    # *** WARNING *** The locale specified by the environment affects sort order.
    # Set LC_ALL=C to get the traditional sort order that uses native byte values.
    # -s stable; -f ignore case; -V natural sort (versioning); -k column; -t tab delimiter
    subprocess.check_call(['sort', '-sfV', '-t\t', '-k1,1', '-o', temp.name, temp.name])

    return temp  # Enables temp file name to be used by caller.
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
946
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
947
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def postprocess_line(self, line):
    """
    Rebuild fasta output text from one Kipper single-line fasta record.

    @param line string holding [accession id][TAB][description][TAB][fasta sequence]
    @return string ">[accession id] [description]" header line followed by the sequence
        re-wrapped into pieces of at most 80 characters joined by newlines.
    """
    # At most three fields; any further tabs stay inside the sequence field.
    fields = line.split('\t', 2)

    # Re-wrap the one-line sequence into 80 character pieces.
    chunks = self.split_len(fields[2], 80)
    # A final piece that holds nothing but whitespace (e.g. just the record's
    # trailing newline) is blanked so it adds no extra characters of its own.
    if chunks and not chunks[-1].strip():
        chunks[-1] = ''

    header = '>' + ' '.join(fields[:2]) + '\n'
    return header + '\n'.join(chunks)
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
964
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
965
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def split_len(self, seq, length):
    """
    Cut a sequence into consecutive pieces of at most "length" items.

    @param seq string (or other sliceable sequence) to cut up
    @param length integer maximum size of each piece
    @return list of pieces in original order; only the last may be shorter.
    """
    pieces = []
    for start in range(0, len(seq), length):
        pieces.append(seq[start:start + length])
    return pieces
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
968
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
969
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
class bigFileReader(object):
    """
    This provides some advantage over reading line by line, and as well has a system
    for skipping/not advancing reads - it has a memory via "take_step" about whether
    it should advance or not - this is used when the master database and the import
    database are feeding lines into a new database.

    Interestingly, using readlines() with byte hint parameter less
    than file size improves performance by at least 30% over readline().

    Attributes:
        lines      list of lines already fetched from the file but not yet handed out
        line       the current line, or False when there is none (start or end of file)
        take_step  True when the next turn() should advance to a new line
        file       the underlying binary file object

    FUTURE: Adjust buffer lines dynamically based on file size/lines ratio?
    """

    def __init__(self, filename):
        """
        @param filename string Path of file to read. A name ending in ".gz" is
            opened through gzip; anything else is opened as a plain binary file.
        """
        self.lines = []
        # This simply allows any .gz repository to be opened
        # It isn't connected to the Kipper metadata['compression'] feature.
        if filename[-3:] == '.gz':
            self.file = gzip.open(filename, 'rb')
        else:
            # No buffering argument: line buffering has no effect on a binary
            # read handle (and Python 3 warns about it), so use the default.
            self.file = open(filename, 'rb')

        self.line = False
        self.take_step = True
        # Size hint handed to file.readlines(). The hint is an approximate
        # byte count, not a number of lines.
        self.buffer_size = 1000

    def __enter__(self):
        """Allow "with bigFileReader(path) as reader:" usage."""
        return self

    def __exit__(self, *args):
        """Close the underlying file when the with-block ends."""
        self.close()

    def close(self):
        """Release the underlying file handle."""
        self.file.close()

    def turn(self):
        """
        When accessing bigFileReader via turn mechanism, we get current line if no step;
        otherwise with step we read new line.

        @return the line, or False when there is no line to give.
        """
        if self.take_step:
            self.take_step = False
            return self.read()
        return self.line

    def read(self):
        """
        Advance to the next line, refilling the buffer from the file when empty.

        @return the new current line, or False at end of file.
        """
        if not self.lines:
            self.lines = self.file.readlines(self.buffer_size)
        if self.lines:
            # pop(0) is kept (rather than a deque) so self.lines stays a plain
            # list for callers that inspect it; the buffer is small.
            self.line = self.lines.pop(0)
            return self.line

        # End of file: forget the last line so that a later turn() without a
        # step() reports end of file instead of replaying a stale line.
        self.line = False
        return False

    def readlines(self):
        """
        Small efficiency:
        A test on self.lines after readLines() call can control loop.
        Bulk hand-over of the remaining buffer: the buffered list itself is
        returned and self.lines is re-bound to the next batch from the file.

        @return list of buffered lines, or False at end of file.
        """
        self.line = False
        if not self.lines:
            self.lines = self.file.readlines(self.buffer_size)
        if self.lines:
            batch = self.lines
            # Re-binding (not mutating) self.lines leaves "batch" untouched.
            self.lines = self.file.readlines(self.buffer_size)
            return batch
        return False

    def step(self):
        """Ask the next turn() call to advance to a new line."""
        self.take_step = True
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
1040
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
1041
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
1042
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
1043 # Enables use of with ... syntax. See https://mail.python.org/pipermail/tutor/2009-November/072959.html
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
class myGzipFile(gzip.GzipFile):
    """gzip.GzipFile with explicit context-manager methods for "with" blocks."""

    def __enter__(self):
        """
        @return this file object, provided it is still open.
        Raises ValueError when the file has already been closed.
        """
        if self.fileobj is not None:
            return self
        raise ValueError("I/O operation on closed GzipFile object")

    def __exit__(self, *args):
        """Close the file on leaving the with-block; exception details are ignored."""
        self.close()
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
1052
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
1053
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def natural_sort_key(s, _nsre = REGEX_NATURAL_SORT):
    """
    Build a sort key from a string by splitting it with the _nsre pattern.

    @param s string to build the key for
    @param _nsre pattern handed to re.split() to cut s into pieces
    @return list in which all-digit pieces are integers and every other
        piece is its lower-cased text.
    """
    key = []
    for piece in re.split(_nsre, s):
        if piece.isdigit():
            key.append(int(piece))
        else:
            key.append(piece.lower())
    return key
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
1057
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
1058
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def generic_linux_sort(self):
    """
    Sort a list of strings in place using "C" locale collation.

    NOTE(review): the previous body sorted an undefined name ("yourList") and
    so raised NameError on every call. Presumably the single argument is the
    list to sort - confirm against callers.

    Side effect: sets LC_ALL to "C" for the whole process.

    @param self list of strings, sorted in place
    """
    import functools
    import locale
    locale.setlocale(locale.LC_ALL, "C")
    # cmp_to_key() stands in for sort(cmp=...), a keyword only Python 2 accepts.
    self.sort(key=functools.cmp_to_key(locale.strcoll))
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
1063
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
1064
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
def parse_date(adate):
    """
    Turn a human-entered date/time string into an integer linux timestamp.

    @param adate string Human entered date to parse into linux time

    @return integer Linux time equivalent or 0 if no date supplied
    """
    trimmed = adate.strip()
    # Nothing left after trimming whitespace means no date was supplied.
    if not (trimmed > ''):
        return 0

    # fuzzy=True is passed straight through to the dateutil parser.
    parsed = parser2.parse(trimmed, fuzzy=True)
    # calendar.timegm() reads the parsed date/time fields as UTC.
    return calendar.timegm(parsed.timetuple())
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
1080
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
1081
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
if __name__ == '__main__':
    # Executed as a script: create the Kipper object and call its __main__()
    # entry method. The module-level name "kipper" is kept bound as before.
    kipper = Kipper()
    kipper.__main__()
5c5027485f7d Uploaded correct file
damion
parents:
diff changeset
1086