comparison TEisotools-1.1.a/commons/core/utils/FileUtils.py @ 16:836ce3d9d47a draft default tip

Uploaded
author urgi-team
date Thu, 21 Jul 2016 07:42:47 -0400
parents 255c852351c5
children
comparison
equal deleted inserted replaced
15:255c852351c5 16:836ce3d9d47a
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31
32 import os
33 import re
34 import sys
35 import math
36 import glob
37 import shutil
38 import subprocess
39 from operator import itemgetter
40 try:
41 import hashlib
42 except:
43 pass
44
45
## Collection of static helpers to count, compare, concatenate, split,
#  clean up and name plain files.
#
#  All helpers are static methods; the class holds no state.
#
class FileUtils( object ):

    ## Return the number of lines in the given file
    #
    # The file is read in binary mode and the newline characters are counted.
    # A last line without end-of-line symbol counts as one line. When the
    # last line of the file is an empty line (only an end-of-line symbol),
    # that single line is not counted; hence a file holding only one
    # end-of-line symbol is reported as having 0 line.
    #
    # @param fileName string name of the file
    #
    # @return int number of lines
    #
    @staticmethod
    def getNbLinesInSingleFile( fileName ):
        nbLines = 0
        lastLine = b""
        with open( fileName, "rb" ) as fileHandler:
            for line in fileHandler:
                if line.endswith( b"\n" ):
                    nbLines += 1
                lastLine = line

        if lastLine == b"":
            # no data at all
            return 0
        if lastLine == b"\n":
            # a final empty line is not counted
            return nbLines - 1
        if not lastLine.endswith( b"\n" ):
            # last line has data but no end-of-line symbol
            return nbLines + 1
        return nbLines

    ## Return the number of lines in the files in the given list
    #
    # @param lFileNames list of file names
    #
    # @return int sum of the line counts of every file
    #
    @staticmethod
    def getNbLinesInFileList( lFileNames ):
        return sum( FileUtils.getNbLinesInSingleFile( fileName ) for fileName in lFileNames )

    ## Return True if the given file exists, False otherwise
    #
    # @param fileName string path to test (checked with os.path.exists)
    #
    @staticmethod
    def isRessourceExists( fileName ):
        return os.path.exists( fileName )

    ## Return True if the given file is empty, False otherwise
    #
    # @note relies on getNbLinesInSingleFile(), so a file containing only
    #       one end-of-line symbol is considered empty
    #
    @staticmethod
    def isEmpty( fileName ):
        return 0 == FileUtils.getNbLinesInSingleFile( fileName )

    ## Return True if both files are identical, False otherwise
    #
    # The contents are compared byte by byte, chunk after chunk, without
    # any external program or temporary file.
    #
    # @param file1 string name of the first file
    # @param file2 string name of the second file
    #
    # @return bool False (with a warning on stdout) if a file cannot be read
    #
    @staticmethod
    def are2FilesIdentical( file1, file2 ):
        chunkSize = 64 * 1024
        try:
            with open( file1, "rb" ) as handler1:
                with open( file2, "rb" ) as handler2:
                    while True:
                        chunk1 = handler1.read( chunkSize )
                        chunk2 = handler2.read( chunkSize )
                        if chunk1 != chunk2:
                            return False
                        if not chunk1:
                            # both files exhausted at the same time
                            return True
        except ( IOError, OSError ) as error:
            print( "WARNING: cannot compare '%s' and '%s': %s" % ( file1, file2, error ) )
            return False

    ## Return a string with all the content of the files in the given list
    #
    # @param lFiles list of file names
    #
    # @note the given list is sorted in place before the files are read
    #
    @staticmethod
    def getFileContent( lFiles ):
        lFiles.sort()
        lContents = []
        for fileName in lFiles:
            with open( fileName, "r" ) as currentFile:
                lContents.append( currentFile.read() )
        return "".join( lContents )

    ## Save content of the given file after having sorted it
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file (default: overwrite inFile)
    #
    # @note if the last line has no end-of-line symbol, one is added so that
    #       it cannot be merged with the following line once sorted
    #
    @staticmethod
    def sortFileContent( inFile, outFile="" ):
        with open( inFile, "rb" ) as inFileHandler:
            lines = inFileHandler.readlines()
        if lines and not lines[-1].endswith( b"\n" ):
            lines[-1] += b"\n"
        lines.sort()
        if outFile == "":
            outFile = inFile
        with open( outFile, "wb" ) as outFileHandler:
            outFileHandler.writelines( lines )

    ## Add end-of-line symbol to the given file content if necessary
    #
    # @param fileContent string
    #
    # @return string unchanged if empty or already ending with an end-of-line
    #
    @staticmethod
    def addNewLineAtTheEndOfFileContent( fileContent ):
        if fileContent and not fileContent.endswith( '\n' ):
            fileContent += '\n'
        return fileContent

    ## Concatenate files in the given list
    #
    # @param lFiles list of input file names
    # @param outFile string name of the output file (opened in append mode)
    # @param sort bool if True, lFiles is sorted in place first
    # @param skipHeaders bool if True, the first line of each input file is skipped
    # @param separator string written between two consecutive input files
    #
    @staticmethod
    def catFilesFromList( lFiles, outFile, sort=True, skipHeaders = False, separator = "" ):
        if sort:
            lFiles.sort()
        with open( outFile, "a" ) as outFileHandler:
            for index, singleFile in enumerate( lFiles ):
                if index > 0:
                    outFileHandler.write( separator )
                with open( singleFile, "r" ) as singleFileHandler:
                    if skipHeaders:
                        singleFileHandler.readline()
                    shutil.copyfileobj( singleFileHandler, outFileHandler )

    ## Concatenate files according to the given pattern
    #
    # @param pattern string glob pattern selecting the input files
    # @param outFile string name of the output file
    # @param skipHeaders bool see catFilesFromList()
    # @param separator string see catFilesFromList()
    #
    @staticmethod
    def catFilesByPattern( pattern, outFile, skipHeaders = False, separator = "" ):
        lFiles = glob.glob( pattern )
        FileUtils.catFilesFromList( lFiles, outFile, skipHeaders = skipHeaders, separator = separator )

    ## Cat all files of a given directory
    #
    # @param directory string directory name
    # @param outFileName string output file name
    #
    @staticmethod
    def catFilesOfDir( directory, outFileName ):
        FileUtils.catFilesByPattern( "%s/*" % directory, outFileName )

    ## Remove files listed according to the given pattern
    #
    # @example prefix="/home/tmp/dummy*.txt"
    #
    @staticmethod
    def removeFilesByPattern( prefix ):
        for fileName in glob.glob( prefix ):
            os.remove( fileName )

    ## Remove files listed according to the suffixes in the given list
    #
    # @param targetPath string directory in which files are removed
    # @param lSuffixes list of suffixes; every file matching "*suffix" is removed
    #
    @staticmethod
    def removeFilesBySuffixList( targetPath, lSuffixes ):
        # drop one trailing slash so that the pattern has a single separator
        if targetPath[-1] == "/":
            targetPath = targetPath[:-1]
        for suffix in lSuffixes:
            FileUtils.removeFilesByPattern( "%s/*%s" % ( targetPath, suffix ) )

    ## Remove repeated blanks in the given file
    #
    # Each run of several space characters is replaced by a single space.
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file (default: overwrite inFile)
    #
    @staticmethod
    def removeRepeatedBlanks( inFile, outFile="" ):
        if outFile == "":
            outFile = inFile
        # temporary file next to the output so that inFile may hold a path
        tmpFile = "%s.tmp" % ( outFile )
        repeatedBlanks = re.compile( b" {2,}" )
        with open( inFile, "rb" ) as inFileHandler:
            with open( tmpFile, "wb" ) as tmpFileHandler:
                for line in inFileHandler:
                    tmpFileHandler.write( repeatedBlanks.sub( b" ", line ) )
        os.rename( tmpFile, outFile )

    ## Remove files in the given list
    #
    # @param lFiles list of file names; each one must exist
    #
    @staticmethod
    def removeFilesFromList( lFiles ):
        for fileName in lFiles:
            os.remove( fileName )

    ## Remove files in the given list if exist
    #
    # @param lFiles list of file names; missing ones are ignored
    #
    @staticmethod
    def removeFilesFromListIfExist( lFiles ):
        for fileName in lFiles:
            if FileUtils.isRessourceExists( fileName ):
                os.remove( fileName )

    ## Append the content of a file to another file
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file
    #
    @staticmethod
    def appendFileContent( inFile, outFile ):
        with open( outFile, "a" ) as outFileHandler:
            with open( inFile, "r" ) as inFileHandler:
                shutil.copyfileobj( inFileHandler, outFileHandler )

    ## Replace Windows end-of-line by Unix end-of-line
    #
    # @param inFile string name of the file, converted in place
    #
    # @note the converted content is written in "<inFile>.tmp", which then
    #       replaces inFile; the input is thus kept until conversion is done
    #
    @staticmethod
    def fromWindowsToUnixEof( inFile ):
        tmpFile = "%s.tmp" % ( inFile )
        # binary mode: bytes are left untouched except for "\r\n"
        with open( inFile, "rb" ) as inFileHandler:
            with open( tmpFile, "wb" ) as tmpFileHandler:
                for line in inFileHandler:
                    tmpFileHandler.write( line.replace( b"\r\n", b"\n" ) )
        os.rename( tmpFile, inFile )

    ## Remove duplicated lines in a file
    #
    # @param inFile string name of the file, rewritten in place
    #
    # @note it preserves the initial order and handles blank lines
    # @note every written line ends with an end-of-line symbol
    #
    @staticmethod
    def removeDuplicatedLines( inFile ):
        with open( inFile, "rb" ) as inFileHandler:
            lLines = inFileHandler.read().split( b"\n" )
        # the split yields a last empty item when the file ends with "\n"
        if lLines[-1] == b"":
            del lLines[-1]

        tmpFile = "%s.tmp" % ( inFile )
        sSeenLines = set()
        with open( tmpFile, "wb" ) as tmpFileHandler:
            for line in lLines:
                if line not in sSeenLines:
                    sSeenLines.add( line )
                    tmpFileHandler.write( line + b"\n" )
        os.rename( tmpFile, inFile )

    ## Write a list of lines in a given file
    #
    # @param inFile string name of the file to (over)write
    # @param lLines list of strings, written as they are (no end-of-line added)
    #
    @staticmethod
    def writeLineListInFile( inFile, lLines ):
        with open( inFile, "w" ) as inFileHandler:
            inFileHandler.writelines( lLines )

    ## Give the list of path of each directory in the given directory
    #
    # @param rootPath string path of the given directory
    #
    # @return lDirPath list of directory paths, each one starting with
    #         rootPath (hence absolute only if rootPath is absolute)
    #
    @staticmethod
    def getAbsoluteDirectoryPathList( rootPath ):
        return [ ressource for ressource in glob.glob( rootPath + "/*" ) if os.path.isdir( ressource ) ]

    ## Get a sublist of which each element matches/doesn't match a pattern
    #
    # @param lPath string list of paths
    #
    # @param pattern string regular expression searched anywhere in each path
    #
    # @param match bool if True keep matching paths, otherwise keep the others
    #
    # @return lPathMatching list of path matching pattern
    #
    @staticmethod
    def getSubListAccordingToPattern( lPath, pattern, match = True ):
        # search() rather than match(".*pattern.*"): the latter gives wrong
        # results with alternations such as "a|b"
        compiledPattern = re.compile( pattern )
        keepMatching = bool( match )
        return [ path for path in lPath if ( compiledPattern.search( path ) is not None ) == keepMatching ]

    ## Give the list of file names found in the given directory
    #
    # @param dirPath string path of the given directory
    # @param patternFileFilter string regular expression the beginning of
    #        the file name has to match
    #
    # @return lFilesInDir list of file names (without directory)
    #
    @staticmethod
    def getFileNamesList( dirPath, patternFileFilter = ".*" ):
        fileFilter = re.compile( patternFileFilter )
        lFilesInDir = []
        for ressource in glob.glob( dirPath + "/*" ):
            if os.path.isfile( ressource ):
                fileName = os.path.basename( ressource )
                if fileFilter.match( fileName ):
                    lFilesInDir.append( fileName )
        return lFilesInDir

    ## Return the MD5 sum of a file
    #
    # @param inFile string name of the file
    #
    # @return string hexadecimal digest, or "" if hashlib is not available
    #
    @staticmethod
    def getMd5SecureHash( inFile ):
        if "hashlib" not in sys.modules:
            return ""
        md5 = hashlib.md5()
        # binary mode: the digest is computed on the bytes of the file
        with open( inFile, "rb" ) as inFileHandler:
            while True:
                chunk = inFileHandler.read( 64 * 1024 )
                if not chunk:
                    break
                md5.update( chunk )
        return md5.hexdigest()

    ## Return True if size file > 0 octet
    #
    # @param fileName string file name
    #
    @staticmethod
    def isSizeNotNull( fileName ):
        return os.path.getsize( fileName ) > 0

    ## Split one file into N Files by lines
    #
    # Output files are named "<prefix>-<i><ext>" (i starting at 1) from the
    # base name of the input, hence created in the current directory. Each
    # one receives ceil(number of lines / N) lines read sequentially, so the
    # last files may be shorter or empty.
    #
    # @param fileName string file name
    # @param N int number of files to create; reduced to the number of lines
    #        if greater; a single file is created when this gives 0
    #
    @staticmethod
    def splitFileIntoNFiles( fileName, N ):
        nbLine = FileUtils.getNbLinesInSingleFile( fileName )
        nbLinesInEachFile = nbLine
        if N > nbLine:
            N = nbLine
        if N != 0:
            nbLinesInEachFile = int( math.ceil( float( nbLine ) / N ) )
        else:
            N = 1
        filePrefix, fileExt = os.path.splitext( os.path.basename( fileName ) )
        with open( fileName, "rb" ) as fileHandler:
            for i in range( 1, N + 1 ):
                with open( "%s-%s%s" % ( filePrefix, i, fileExt ), "wb" ) as f:
                    for _ in range( nbLinesInEachFile ):
                        f.write( fileHandler.readline() )

    ## Split one file into files of N lines
    #
    # Output files are named "<prefix>-<i><ext>" (i starting at 1) from the
    # base name of the input, hence created in the current directory.
    #
    # @param fileName string input file name
    # @param N int lines number per files; if N <= 0 or if the input is
    #        empty, the whole input is copied into a single file
    #
    @staticmethod
    def splitFileAccordingToLineNumber( fileName, N ):
        filePrefix, fileExt = os.path.splitext( os.path.basename( fileName ) )
        with open( fileName, "rb" ) as inF:
            fileNb = 1
            line = inF.readline()
            if not line or N <= 0:
                outFileName = "%s-%s%s" % ( filePrefix, fileNb, fileExt )
                with open( outFileName, "wb" ) as outF:
                    # first line already consumed, then the remaining content
                    outF.write( line )
                    shutil.copyfileobj( inF, outF )
            else:
                while line:
                    outFileName = "%s-%s%s" % ( filePrefix, fileNb, fileExt )
                    with open( outFileName, "wb" ) as outF:
                        lineNb = 1
                        while lineNb <= N and line:
                            outF.write( line )
                            line = inF.readline()
                            lineNb += 1
                    fileNb += 1

    ## Concatenates names from a list, using a given separator and a given extension.
    #
    # @param lNames list of file names (sorted in place)
    # @param sep separator used to join names
    # @param ext extension of the return file name. If None, the most represented extension in lNames is used.
    #        If there are several, the first of these extensions in alphabetical order is used
    #
    # @return concatName name concatenated ("" if lNames is empty)
    #
    @staticmethod
    def concatenateFileNamesFromList( lNames, sep = "_", ext = None ):
        if not lNames:
            return ""

        lNames.sort()
        lBaseNames = []
        lExt = []
        for name in lNames:
            baseName, extension = os.path.splitext( os.path.basename( name ) )
            lBaseNames.append( baseName )
            lExt.append( extension )

        if ext is None:
            dExtToNb = {}
            for extension in lExt:
                dExtToNb[extension] = dExtToNb.get( extension, 0 ) + 1
            # highest count first, then alphabetical order
            ext = min( dExtToNb.items(), key = lambda item: ( -item[1], item[0] ) )[0]

        if ext and ext[0] != '.':
            ext = ".%s" % ext

        return "%s%s" % ( sep.join( lBaseNames ), ext )

    ## Concatenates names from a string, using a given separator and a given extension. Names are split from the string using splitSep
    #
    # @param filesNames string of file names
    # @param splitSep separator used to split names from the input string
    # @param joinSep separator used to join names
    # @param ext extension of the return file name. If None, the most represented extension in the names is used.
    #        If there are several, the first of these extensions in alphabetical order is used
    #
    # @return concatName,lFilesNames name concatenated and split files list sorted alphabetically. Return original name if splitSep is empty.
    #
    @staticmethod
    def concatenateFileNamesFromString( filesNames, splitSep = ",", joinSep = "_", ext = None ):
        if splitSep:
            lFilesNames = filesNames.split( splitSep )
            # concatenateFileNamesFromList() sorts lFilesNames in place
            return FileUtils.concatenateFileNamesFromList( lFilesNames, joinSep, ext ), lFilesNames
        else:
            print( "WARNING: no split separator provided, returning input string" )
            return filesNames, [filesNames]
479