6
|
1 # Copyright INRA (Institut National de la Recherche Agronomique)
|
|
2 # http://www.inra.fr
|
|
3 # http://urgi.versailles.inra.fr
|
|
4 #
|
|
5 # This software is governed by the CeCILL license under French law and
|
|
6 # abiding by the rules of distribution of free software. You can use,
|
|
7 # modify and/ or redistribute the software under the terms of the CeCILL
|
|
8 # license as circulated by CEA, CNRS and INRIA at the following URL
|
|
9 # "http://www.cecill.info".
|
|
10 #
|
|
11 # As a counterpart to the access to the source code and rights to copy,
|
|
12 # modify and redistribute granted by the license, users are provided only
|
|
13 # with a limited warranty and the software's author, the holder of the
|
|
14 # economic rights, and the successive licensors have only limited
|
|
15 # liability.
|
|
16 #
|
|
17 # In this respect, the user's attention is drawn to the risks associated
|
|
18 # with loading, using, modifying and/or developing or reproducing the
|
|
19 # software by the user in light of its specific status of free software,
|
|
20 # that may mean that it is complicated to manipulate, and that also
|
|
21 # therefore means that it is reserved for developers and experienced
|
|
22 # professionals having in-depth computer knowledge. Users are therefore
|
|
23 # encouraged to load and test the software's suitability as regards their
|
|
24 # requirements in conditions enabling the security of their systems and/or
|
|
25 # data to be ensured and, more generally, to use and operate it in the
|
|
26 # same conditions as regards security.
|
|
27 #
|
|
28 # The fact that you are presently reading this means that you have had
|
|
29 # knowledge of the CeCILL license and that you accept its terms.
|
|
30
|
|
31
|
|
32 import os
|
|
33 import re
|
|
34 import sys
|
|
35 import math
|
|
36 import glob
|
|
37 import shutil
|
|
38 import subprocess
|
|
39 from operator import itemgetter
|
|
40 try:
|
|
41 import hashlib
|
|
42 except:
|
|
43 pass
|
|
44
|
|
45
|
|
## Collection of static helpers to count, compare, concatenate, clean up,
# split and name files.
#
# All methods are static: call them as FileUtils.methodName(...).
#
class FileUtils(object):

    ## Return the number of lines in the given file
    #
    # Counting rules:
    #  - every end-of-line character counts for one line;
    #  - a last line without end-of-line character is counted as well
    #    (including when it is the only line of the file);
    #  - one trailing empty line (file ending with two end-of-line
    #    characters, or made of a single end-of-line character) is ignored.
    #
    # The file is read in binary chunks, hence no external command is needed
    # and file names with blanks are supported.
    #
    # @param fileName string name of the file
    #
    # @return nbLines int number of lines
    #
    @staticmethod
    def getNbLinesInSingleFile(fileName):
        nbNewLines = 0
        lastTwoBytes = b""
        with open(fileName, "rb") as fileHandler:
            while True:
                chunk = fileHandler.read(1048576)
                if not chunk:
                    break
                nbNewLines += chunk.count(b"\n")
                # only the very end of the file is needed for the corrections
                lastTwoBytes = (lastTwoBytes + chunk)[-2:]

        if not lastTwoBytes:
            # empty file
            return 0
        if not lastTwoBytes.endswith(b"\n"):
            # unterminated last line
            return nbNewLines + 1
        if lastTwoBytes in (b"\n", b"\n\n"):
            # trailing empty line
            return nbNewLines - 1
        return nbNewLines

    ## Return the number of lines in the files in the given list
    #
    # @param lFileNames list of file names
    #
    # @return count int sum of the number of lines of each file
    #
    @staticmethod
    def getNbLinesInFileList(lFileNames):
        return sum(FileUtils.getNbLinesInSingleFile(fileName) for fileName in lFileNames)

    ## Return True if the given file exists, False otherwise
    #
    # @note relies on os.path.exists(), thus also True for a directory
    #
    @staticmethod
    def isRessourceExists(fileName):
        return os.path.exists(fileName)

    ## Return True if the given file is empty, False otherwise
    #
    # @note "empty" means that getNbLinesInSingleFile() returns 0, thus a file
    # made of a single end-of-line character is considered as empty
    #
    @staticmethod
    def isEmpty(fileName):
        return 0 == FileUtils.getNbLinesInSingleFile(fileName)

    ## Return True if both files are identical, False otherwise
    #
    # The contents are compared byte by byte, without temporary file nor
    # external command. If one of the files cannot be read, a warning is
    # printed and False is returned.
    #
    # @param file1 string name of the first file
    # @param file2 string name of the second file
    #
    @staticmethod
    def are2FilesIdentical(file1, file2):
        chunkSize = 1048576
        try:
            with open(file1, "rb") as handler1:
                with open(file2, "rb") as handler2:
                    while True:
                        chunk1 = handler1.read(chunkSize)
                        chunk2 = handler2.read(chunkSize)
                        if chunk1 != chunk2:
                            return False
                        if not chunk1:
                            # both files exhausted at the same time
                            return True
        except (IOError, OSError) as error:
            print("WARNING: can't compare '%s' and '%s' (%s)" % (file1, file2, error))
            return False

    ## Return a string with all the content of the files in the given list
    #
    # @param lFiles list of file names
    #
    # @note the given list is sorted in place before reading the files
    #
    @staticmethod
    def getFileContent(lFiles):
        lFiles.sort()
        lContents = []
        for fileName in lFiles:
            with open(fileName, "r") as currentFile:
                lContents.append(currentFile.read())
        return "".join(lContents)

    ## Save content of the given file after having sorted it
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file (default: overwrite the input file)
    #
    @staticmethod
    def sortFileContent(inFile, outFile=""):
        with open(inFile, "r") as inFileHandler:
            lines = inFileHandler.readlines()
        lines.sort()
        if outFile == "":
            outFile = inFile
        with open(outFile, "w") as outFileHandler:
            outFileHandler.writelines(lines)

    ## Add end-of-line symbol to the given file content if necessary
    #
    # @param fileContent string content of a file
    #
    # @return fileContent string same content, ending with an end-of-line unless empty
    #
    @staticmethod
    def addNewLineAtTheEndOfFileContent(fileContent):
        if len(fileContent) != 0 and not fileContent.endswith('\n'):
            fileContent += '\n'
        return fileContent

    ## Concatenate files in the given list
    #
    # @param lFiles list of names of the files to concatenate
    # @param outFile string name of the output file (opened in append mode)
    # @param sort boolean sort the given list (in place) before concatenating
    # @param skipHeaders boolean skip the first line of each file
    # @param separator string written between two consecutive files
    #
    @staticmethod
    def catFilesFromList(lFiles, outFile, sort=True, skipHeaders = False, separator = ""):
        if sort:
            lFiles.sort()
        with open(outFile, "a") as outFileHandler:
            for index, singleFile in enumerate(lFiles):
                if index > 0:
                    outFileHandler.write(separator)
                with open(singleFile, "r") as singleFileHandler:
                    if skipHeaders:
                        singleFileHandler.readline()
                    shutil.copyfileobj(singleFileHandler, outFileHandler)

    ## Concatenate files according to the given pattern
    #
    # @param pattern string glob pattern selecting the files to concatenate
    # @param outFile string name of the output file
    # @param skipHeaders boolean skip the first line of each file
    # @param separator string written between two consecutive files
    #
    @staticmethod
    def catFilesByPattern(pattern, outFile, skipHeaders = False, separator = ""):
        lFiles = glob.glob(pattern)
        FileUtils.catFilesFromList(lFiles, outFile, skipHeaders = skipHeaders, separator = separator)

    ## Cat all files of a given directory
    #
    # @param directory string directory name
    # @param outFileName string output file name
    #
    @staticmethod
    def catFilesOfDir(directory, outFileName):
        FileUtils.catFilesByPattern("%s/*" % directory, outFileName)

    ## Remove files listed according to the given pattern
    #
    # @example prefix="/home/tmp/dummy*.txt"
    #
    @staticmethod
    def removeFilesByPattern(prefix):
        for fileName in glob.glob(prefix):
            os.remove(fileName)

    ## Remove files listed according to the suffixes in the given list
    #
    # @param targetPath string non-empty path of the directory containing the files
    # @param lSuffixes list of suffixes, e.g. [".txt", ".tmp"]
    #
    @staticmethod
    def removeFilesBySuffixList(targetPath, lSuffixes):
        if targetPath[-1] == "/":
            targetPath = targetPath[:-1]
        for suffix in lSuffixes:
            FileUtils.removeFilesByPattern("%s/*%s" % (targetPath, suffix))

    ## Remove repeated blanks in the given file
    #
    # Each run of consecutive space characters is replaced by a single space.
    # The result is first written in a temporary file located in the same
    # directory as the output file, then renamed.
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file (default: overwrite the input file)
    #
    @staticmethod
    def removeRepeatedBlanks(inFile, outFile=""):
        if outFile == "":
            outFile = inFile
        # only base names are used to build the temporary name, so that it
        # stays valid when the given names contain directories
        tmpName = "tr_%s_%s" % (os.path.basename(inFile), os.path.basename(outFile))
        tmpFile = os.path.join(os.path.dirname(outFile), tmpName)
        repeatedBlanks = re.compile(b" {2,}")
        with open(inFile, "rb") as inFileHandler:
            with open(tmpFile, "wb") as tmpFileHandler:
                for line in inFileHandler:
                    tmpFileHandler.write(repeatedBlanks.sub(b" ", line))
        os.rename(tmpFile, outFile)

    ## Remove files in the given list
    #
    # @param lFiles list of file names (each of them has to exist)
    #
    @staticmethod
    def removeFilesFromList(lFiles):
        for fileName in lFiles:
            os.remove(fileName)

    ## Remove files in the given list if exist
    #
    # @param lFiles list of file names (missing files are skipped)
    #
    @staticmethod
    def removeFilesFromListIfExist(lFiles):
        for fileName in lFiles:
            if FileUtils.isRessourceExists(fileName):
                os.remove(fileName)

    ## Append the content of a file to another file
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file
    #
    @staticmethod
    def appendFileContent(inFile, outFile):
        with open(outFile, "a") as outFileHandler:
            with open(inFile, "r") as inFileHandler:
                shutil.copyfileobj(inFileHandler, outFileHandler)

    ## Replace Windows end-of-line by Unix end-of-line
    #
    # The converted content is written in "<inFile>.tmp", which then replaces
    # the input file: the input file is only removed once the conversion is over.
    #
    # @param inFile string name of the file to convert in place
    #
    @staticmethod
    def fromWindowsToUnixEof(inFile):
        tmpFile = "%s.tmp" % (inFile)
        # binary mode, so that no automatic end-of-line translation interferes
        with open(inFile, "rb") as inFileHandler:
            with open(tmpFile, "wb") as tmpFileHandler:
                for line in inFileHandler:
                    tmpFileHandler.write(line.replace(b"\r\n", b"\n"))
        os.remove(inFile)
        os.rename(tmpFile, inFile)

    ## Remove duplicated lines in a file
    #
    # @note it preserves the initial order and handles blank lines
    # @note each line of the result ends with an end-of-line character
    #
    # @param inFile string name of the file to update in place
    #
    @staticmethod
    def removeDuplicatedLines(inFile):
        with open(inFile, "rb") as inFileHandler:
            lLines = inFileHandler.read().split(b"\n")
        if lLines[-1] == b"":
            # not a line, only what follows the last end-of-line character
            del lLines[-1]

        tmpFile = "%s.tmp" % (inFile)
        sLinesAlreadySeen = set()
        with open(tmpFile, "wb") as tmpFileHandler:
            for line in lLines:
                if line not in sLinesAlreadySeen:
                    sLinesAlreadySeen.add(line)
                    tmpFileHandler.write(line + b"\n")
        os.remove(inFile)
        os.rename(tmpFile, inFile)

    ## Write a list of lines in a given file
    #
    # @param inFile string name of the file to write (overwritten if it exists)
    # @param lLines list of strings, written as they are (no end-of-line added)
    #
    @staticmethod
    def writeLineListInFile(inFile, lLines):
        with open(inFile, "w") as inFileHandler:
            inFileHandler.writelines(lLines)

    ## Give the list of absolute path of each directory in the given directory
    #
    # @param rootPath string absolute path of the given directory
    #
    # @return lDirPath list of absolute directory path
    #
    @staticmethod
    def getAbsoluteDirectoryPathList(rootPath):
        return [ressource for ressource in glob.glob(rootPath + "/*") if os.path.isdir(ressource)]

    ## Get a sublist of which each element matches/doesn't match a pattern
    #
    # @param lPath string list of paths
    #
    # @param pattern string pattern (regular expression, embedded as ".*pattern.*")
    #
    # @param match bool keep the matching paths if True, the other ones otherwise
    #
    # @return lPathMatching list of path matching pattern
    #
    @staticmethod
    def getSubListAccordingToPattern(lPath, pattern, match = True):
        # compiled once instead of once per path
        regExp = re.compile(".*%s.*" % pattern)
        keepMatching = bool(match)
        return [path for path in lPath if (regExp.match(path) is not None) == keepMatching]

    ## Give the list of file names found in the given directory
    #
    # @param dirPath string absolute path of the given directory
    #
    # @param patternFileFilter string regular expression that file names have to match from their start
    #
    # @return lFilesInDir list of file names
    #
    @staticmethod
    def getFileNamesList(dirPath, patternFileFilter = ".*"):
        regExp = re.compile(patternFileFilter)
        lFilesInDir = []
        for ressource in glob.glob(dirPath + "/*"):
            if os.path.isfile(ressource):
                fileName = os.path.basename(ressource)
                if regExp.match(fileName):
                    lFilesInDir.append(fileName)
        return lFilesInDir

    ## Return the MD5 sum of a file
    #
    # @param inFile string name of the file
    #
    # @return string hexadecimal digest, or "" if the module hashlib is not available
    #
    @staticmethod
    def getMd5SecureHash(inFile):
        if "hashlib" not in sys.modules:
            return ""
        md5 = hashlib.md5()
        # binary chunks: the digest has to be computed on the raw bytes
        with open(inFile, "rb") as inFileHandler:
            while True:
                chunk = inFileHandler.read(1048576)
                if not chunk:
                    break
                md5.update(chunk)
        return md5.hexdigest()

    ## Return True if size file > 0 octet
    #
    # @param fileName string file name
    #
    @staticmethod
    def isSizeNotNull(fileName):
        return os.path.getsize(fileName) > 0

    ## Split one file into N Files by lines
    #
    # Output files are written in the current directory and are named
    # "<prefix>-<i><extension>" with i starting at 1. If N is greater than
    # the number of lines, one file per line is created; if N is 0 (or the
    # file has no line), a single output file is created.
    #
    # @param fileName string file name
    # @param N int number of files to create
    #
    @staticmethod
    def splitFileIntoNFiles(fileName, N):
        nbLine = FileUtils.getNbLinesInSingleFile(fileName)
        nbLinesInEachFile = nbLine
        if N > nbLine:
            N = nbLine
        if N != 0:
            nbLinesInEachFile = int(math.ceil(float(nbLine) / N))
        else:
            N = 1
        filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))
        with open(fileName, "r") as fileHandler:
            for i in range(1, N + 1):
                with open("%s-%s%s" % (filePrefix, i, fileExt), "w") as f:
                    for _ in range(nbLinesInEachFile):
                        # readline() returns "" once the input is exhausted
                        f.write(fileHandler.readline())

    ## Split one file into files of N lines
    #
    # Output files are written in the current directory and are named
    # "<prefix>-<i><extension>" with i starting at 1. If the input file is
    # empty or if N is not strictly positive, the input file is simply copied
    # into a single output file.
    #
    # @param fileName string input file name
    # @param N int lines number per files
    #
    @staticmethod
    def splitFileAccordingToLineNumber(fileName, N):
        filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))
        with open(fileName) as inF:
            fileNb = 1
            line = inF.readline()
            # N <= 0 (and not only N == 0): a negative N would never consume
            # any line, hence would loop forever
            if not line or N <= 0:
                outFileName = "%s-%s%s" % (filePrefix, fileNb, fileExt)
                with open(fileName, "rb") as wholeInF:
                    with open(outFileName, "wb") as f:
                        shutil.copyfileobj(wholeInF, f)
            else:
                while line:
                    outFileName = "%s-%s%s" % (filePrefix, fileNb, fileExt)
                    with open(outFileName, "w") as outF:
                        lineNb = 1
                        while lineNb <= N and line:
                            outF.write(line)
                            line = inF.readline()
                            lineNb += 1
                    fileNb += 1

    ## Concatenates names from a list, using a given separator and a given extension.
    #
    # @param lNames list of file names (sorted in place)
    # @param sep separator used to join names
    # @param ext extension of the return file name. If None, the most represented extension in lNames is used.
    # If there are several, the first of these extensions in alphabetical order is used
    #
    # @return concatName name concatenated ("" if lNames is empty)
    #
    @staticmethod
    def concatenateFileNamesFromList(lNames, sep = "_", ext = None):
        concatName = ""
        if lNames:
            lNames.sort()
            tBaseNames, tExt = zip(*[os.path.splitext(os.path.basename(name)) for name in lNames])

            if ext is None:
                dtExtToNb = {}
                for extension in tExt:
                    dtExtToNb[extension] = dtExtToNb.get(extension, 0) + 1
                # highest count first, alphabetical order in case of tie
                ext = min(dtExtToNb, key = lambda extension: (-dtExtToNb[extension], extension))

            if ext and ext[0] != '.':
                ext = ".%s" % ext

            concatName = "%s%s" % (sep.join(tBaseNames), ext)
        return concatName

    ## Concatenates names from a string, using a given separator and a given extension. Names are split from the string using splitSep
    #
    # @param filesNames string of file names, separated by splitSep
    # @param splitSep separator used to split names from the input string
    # @param joinSep separator used to join names
    # @param ext extension of the return file name. If None, the most represented extension in the names is used.
    # If there are several, the first of these extensions in alphabetical order is used
    #
    # @return concatName,lFilesNames name concatenated and split files list sorted alphabetically. Return original name if splitSep is empty.
    #
    @staticmethod
    def concatenateFileNamesFromString(filesNames, splitSep = ",", joinSep = "_", ext = None):
        if splitSep:
            lFilesNames = filesNames.split(splitSep)
            # the list is sorted in place by concatenateFileNamesFromList()
            return FileUtils.concatenateFileNamesFromList(lFilesNames, joinSep, ext), lFilesNames
        else:
            print("WARNING: no split separator provided, returning input string")
            return filesNames, [filesNames]
|
|
479 |