comparison commons/core/utils/FileUtils.py @ 38:2c0c0a89fad7

Uploaded
author m-zytnicki
date Thu, 02 May 2013 09:56:47 -0400
parents 44d5973c188c
children
comparison
equal deleted inserted replaced
37:d22fadc825e3 38:2c0c0a89fad7
1 # Copyright INRA (Institut National de la Recherche Agronomique)
2 # http://www.inra.fr
3 # http://urgi.versailles.inra.fr
4 #
5 # This software is governed by the CeCILL license under French law and
6 # abiding by the rules of distribution of free software. You can use,
7 # modify and/ or redistribute the software under the terms of the CeCILL
8 # license as circulated by CEA, CNRS and INRIA at the following URL
9 # "http://www.cecill.info".
10 #
11 # As a counterpart to the access to the source code and rights to copy,
12 # modify and redistribute granted by the license, users are provided only
13 # with a limited warranty and the software's author, the holder of the
14 # economic rights, and the successive licensors have only limited
15 # liability.
16 #
17 # In this respect, the user's attention is drawn to the risks associated
18 # with loading, using, modifying and/or developing or reproducing the
19 # software by the user in light of its specific status of free software,
20 # that may mean that it is complicated to manipulate, and that also
21 # therefore means that it is reserved for developers and experienced
22 # professionals having in-depth computer knowledge. Users are therefore
23 # encouraged to load and test the software's suitability as regards their
24 # requirements in conditions enabling the security of their systems and/or
25 # data to be ensured and, more generally, to use and operate it in the
26 # same conditions as regards security.
27 #
28 # The fact that you are presently reading this means that you have had
29 # knowledge of the CeCILL license and that you accept its terms.
30
31
32 import os
33 import glob
34 import shutil
35 import sys
36 import re
37 import math
38 try:
39 import hashlib
40 except:
41 pass
42
43
## Collection of static helpers to inspect, compare, concatenate, split,
# clean up and delete plain files.
#
# @note every method is static: call them as FileUtils.method( ... )
#
class FileUtils( object ):

    ## Return the number of lines in the given file
    #
    # @param fileName string name of the file
    #
    # @return int number of lines; a last line made only of an end-of-line
    #         symbol is not counted
    #
    @staticmethod
    def getNbLinesInSingleFile( fileName ):
        nbLines = 0
        lastLine = None
        # stream the file instead of loading it entirely in memory
        with open( fileName, "r" ) as fileHandler:
            for lastLine in fileHandler:
                nbLines += 1
        if lastLine == "\n":
            return nbLines - 1
        return nbLines

    ## Return the number of lines in the files in the given list
    #
    # @param lFileNames list of file names
    #
    # @return int sum of the line counts of each file
    #
    @staticmethod
    def getNbLinesInFileList( lFileNames ):
        return sum( FileUtils.getNbLinesInSingleFile( fileName ) for fileName in lFileNames )

    ## Return True if the given file exists, False otherwise
    #
    # @param fileName string path to test (file or directory)
    #
    @staticmethod
    def isRessourceExists( fileName ):
        return os.path.exists( fileName )

    ## Return True if the given file is empty, False otherwise
    #
    # @note relies on getNbLinesInSingleFile(), hence a file containing only
    #       one end-of-line symbol is also reported as empty
    #
    @staticmethod
    def isEmpty( fileName ):
        return 0 == FileUtils.getNbLinesInSingleFile( fileName )

    ## Return True if both files are identical, False otherwise
    #
    # @param file1 string name of the first file
    # @param file2 string name of the second file
    #
    # @note the comparison is done byte per byte in Python: no external
    #       'diff' command, no shell and no temporary file are involved
    # @note if one of the files cannot be read, a warning is printed and
    #       False is returned
    #
    @staticmethod
    def are2FilesIdentical( file1, file2 ):
        chunkSize = 65536
        try:
            with open( file1, "rb" ) as fileHandler1:
                with open( file2, "rb" ) as fileHandler2:
                    while True:
                        chunk1 = fileHandler1.read( chunkSize )
                        chunk2 = fileHandler2.read( chunkSize )
                        if chunk1 != chunk2:
                            return False
                        if not chunk1:
                            # both files reached their end at the same time
                            return True
        except ( IOError, OSError ) as error:
            print( "WARNING: can't compare '%s' and '%s' (%s)" % ( file1, file2, error ) )
            return False

    ## Return a string with all the content of the files in the given list
    #
    # @param lFiles list of file names
    #
    # @note files are read in sorted order; the given list is not modified
    #
    @staticmethod
    def getFileContent( lFiles ):
        lContents = []
        for fileName in sorted( lFiles ):
            with open( fileName, "r" ) as currentFile:
                lContents.append( currentFile.read() )
        return "".join( lContents )

    ## Save content of the given file after having sorted it
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file (default: overwrite inFile)
    #
    # @note an end-of-line symbol is added to the last line if missing,
    #       otherwise it would be merged with the next line once sorted
    #
    @staticmethod
    def sortFileContent( inFile, outFile="" ):
        with open( inFile, "r" ) as inFileHandler:
            lines = inFileHandler.readlines()
        if lines and not lines[-1].endswith( "\n" ):
            lines[-1] += "\n"
        lines.sort()
        if outFile == "":
            outFile = inFile
        with open( outFile, "w" ) as outFileHandler:
            outFileHandler.writelines( lines )

    ## Add end-of-line symbol to the given file content if necessary
    #
    # @param fileContent string
    #
    # @return string the content, ended by an end-of-line symbol unless empty
    #
    @staticmethod
    def addNewLineAtTheEndOfFileContent( fileContent ):
        if len( fileContent ) != 0 and not fileContent.endswith( '\n' ):
            fileContent += '\n'
        return fileContent

    ## Concatenate files in the given list
    #
    # @param lFiles list of input file names
    # @param outFile string name of the output file (opened in append mode)
    # @param sort bool concatenate the files in sorted order if True
    # @param skipHeaders bool skip the first line of each input file if True
    # @param separator string written between two consecutive files
    #
    # @note the given list is not modified
    #
    @staticmethod
    def catFilesFromList( lFiles, outFile, sort=True, skipHeaders = False, separator = "" ):
        lOrderedFiles = lFiles
        if sort:
            lOrderedFiles = sorted( lFiles )
        with open( outFile, "a" ) as outFileHandler:
            isFirstFile = True
            for singleFile in lOrderedFiles:
                if not isFirstFile:
                    outFileHandler.write( separator )
                isFirstFile = False
                with open( singleFile, "r" ) as singleFileHandler:
                    if skipHeaders:
                        singleFileHandler.readline()
                    shutil.copyfileobj( singleFileHandler, outFileHandler )

    ## Concatenate files according to the given pattern
    #
    # @param pattern string glob pattern selecting the input files
    # @param outFile string name of the output file
    # @param skipHeaders bool skip the first line of each input file if True
    # @param separator string written between two consecutive files
    #
    @staticmethod
    def catFilesByPattern( pattern, outFile, skipHeaders = False, separator = "" ):
        lFiles = glob.glob( pattern )
        FileUtils.catFilesFromList( lFiles, outFile, skipHeaders = skipHeaders, separator = separator )

    ## Remove files listed according to the given pattern
    #
    # @example prefix="/home/tmp/dummy*.txt"
    #
    @staticmethod
    def removeFilesByPattern( prefix ):
        for f in glob.glob( prefix ):
            os.remove( f )

    ## Remove files listed according to the suffixes in the given list
    #
    # @param targetPath string directory containing the files (non-empty,
    #        an IndexError is raised otherwise)
    # @param lSuffixes list of suffixes, e.g. [ ".tmp", ".log" ]
    #
    @staticmethod
    def removeFilesBySuffixList( targetPath, lSuffixes ):
        if targetPath[-1] == "/":
            targetPath = targetPath[:-1]
        for suffix in lSuffixes:
            FileUtils.removeFilesByPattern( "%s/*%s" % ( targetPath, suffix ) )

    ## Remove repeated blanks in the given file
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file (default: overwrite inFile)
    #
    # @note only runs of space characters are squeezed (as 'tr -s " "' does)
    # @note the result is first written in '<outFile>.tmp', then renamed
    #
    @staticmethod
    def removeRepeatedBlanks( inFile, outFile="" ):
        if outFile == "":
            outFile = inFile
        tmpFile = "%s.tmp" % ( outFile )
        # binary mode: the content is handled as raw bytes, whatever its encoding
        with open( inFile, "rb" ) as inFileHandler:
            with open( tmpFile, "wb" ) as tmpFileHandler:
                for line in inFileHandler:
                    tmpFileHandler.write( re.sub( b" {2,}", b" ", line ) )
        os.rename( tmpFile, outFile )

    ## Remove files in the given list
    #
    # @param lFiles list of file names, each of them has to exist
    #
    @staticmethod
    def removeFilesFromList(lFiles):
        for f in lFiles:
            os.remove(f)

    ## Remove files in the given list if exist
    #
    # @param lFiles list of file names, missing ones are skipped
    #
    @staticmethod
    def removeFilesFromListIfExist(lFiles):
        for fileName in lFiles:
            if FileUtils.isRessourceExists(fileName):
                os.remove(fileName)

    ## Append the content of a file to another file
    #
    # @param inFile string name of the input file
    # @param outFile string name of the output file
    #
    @staticmethod
    def appendFileContent( inFile, outFile ):
        with open( outFile, "a" ) as outFileHandler:
            with open( inFile, "r" ) as inFileHandler:
                shutil.copyfileobj( inFileHandler, outFileHandler )

    ## Replace Windows end-of-line by Unix end-of-line
    #
    # @param inFile string name of the file to convert in place
    #
    # @note the converted content is written in '<inFile>.tmp' which then
    #       replaces the original, so the original is never lost on failure
    #
    @staticmethod
    def fromWindowsToUnixEof( inFile ):
        tmpFile = "%s.tmp" % ( inFile )
        # binary mode: no implicit end-of-line translation by Python
        with open( inFile, "rb" ) as inFileHandler:
            with open( tmpFile, "wb" ) as tmpFileHandler:
                for line in inFileHandler:
                    tmpFileHandler.write( line.replace( b"\r\n", b"\n" ) )
        os.rename( tmpFile, inFile )

    ## Remove duplicated lines in a file
    #
    # @param inFile string name of the file to clean in place
    #
    # @note it preserves the initial order and handles blank lines
    # @note every written line is ended by an end-of-line symbol
    #
    @staticmethod
    def removeDuplicatedLines( inFile ):
        with open( inFile, "r" ) as inFileHandler:
            lLines = inFileHandler.read().split( "\n" )
        # the split yields a last empty item when the content ends with "\n"
        if lLines[-1] == "":
            del lLines[-1]

        sSeenLines = set()
        lUniqueLines = []
        for line in lLines:
            if line not in sSeenLines:
                sSeenLines.add( line )
                lUniqueLines.append( line )

        with open( inFile, "w" ) as outFileHandler:
            outFileHandler.writelines( "%s\n" % ( line ) for line in lUniqueLines )

    ## Write a list of lines in a given file
    #
    # @param inFile string name of the file to write (overwritten)
    # @param lLines list of strings, written as they are (no end-of-line added)
    #
    @staticmethod
    def writeLineListInFile( inFile, lLines ):
        with open( inFile, "w" ) as inFileHandler:
            inFileHandler.writelines( lLines )

    ## Give the list of absolute path of each directory in the given directory
    #
    # @param rootPath string absolute path of the given directory
    #
    # @return lDirPath list of absolute directory path
    #
    # @note returned paths are built from rootPath: they are absolute only
    #       if rootPath is absolute
    #
    @staticmethod
    def getAbsoluteDirectoryPathList(rootPath):
        return [ ressource for ressource in glob.glob(rootPath + "/*") if os.path.isdir(ressource) ]

    ## Get a sublist of which each element matches/doesn't match a pattern
    #
    # @param lPath string list of paths
    #
    # @param pattern string pattern (regular expression searched anywhere
    #        in each path)
    #
    # @param match bool keep matching paths if True, the other ones if False
    #
    # @return lPathMatching list of path matching pattern
    #
    @staticmethod
    def getSubListAccordingToPattern(lPath, pattern, match = True):
        # search the pattern itself rather than embedding it in ".*%s.*",
        # which mis-parses patterns using alternation or inline flags
        regex = re.compile(pattern)
        keepMatching = bool(match)
        lPathMatching = []
        for path in lPath:
            isMatching = regex.search(path) is not None
            if isMatching == keepMatching:
                lPathMatching.append(path)
        return lPathMatching

    ## Give the list of file names found in the given directory
    #
    # @param dirPath string absolute path of the given directory
    # @param patternFileFilter string regular expression a file name has to
    #        match from its beginning to be kept
    #
    # @return lFilesInDir list of file names
    #
    @staticmethod
    def getFileNamesList( dirPath, patternFileFilter = ".*" ):
        lFilesInDir = []
        for ressource in glob.glob( dirPath + "/*" ):
            if not os.path.isfile( ressource ):
                continue
            fileName = os.path.basename( ressource )
            if re.match( patternFileFilter, fileName ):
                lFilesInDir.append( fileName )
        return lFilesInDir

    ## Return the MD5 sum of a file
    #
    # @param inFile string name of the file
    #
    # @return string hexadecimal digest, or "" if hashlib is not available
    #
    @staticmethod
    def getMd5SecureHash( inFile ):
        if "hashlib" not in sys.modules:
            return ""
        md5 = hashlib.md5()
        # binary mode: the digest is computed on the exact bytes of the file
        with open( inFile, "rb" ) as inFileHandler:
            while True:
                chunk = inFileHandler.read( 65536 )
                if not chunk:
                    break
                md5.update( chunk )
        return md5.hexdigest()

    ## Cat all files of a given directory
    #
    # @param dir string directory name
    # @param outFileName string output file name
    #
    @staticmethod
    def catFilesOfDir(dir, outFileName):
        lFilePaths = [ dir + "/" + name for name in FileUtils.getFileNamesList(dir) ]
        FileUtils.catFilesFromList(lFilePaths, outFileName)

    ## Return True if size file > 0 octet
    #
    # @param fileName string file name
    #
    @staticmethod
    def isSizeNotNull(fileName):
        return os.path.getsize(fileName) > 0

    ## Split one file into N Files by lines
    #
    # @param fileName string file name
    # @param N int number of files to create
    #
    # @note output files are written in the current directory and named
    #       '<prefix>-<i><ext>' with i starting at 1
    # @note N is lowered to the number of lines if greater; each file gets
    #       ceil(nbLines / N) lines, hence the last files may be shorter
    #
    @staticmethod
    def splitFileIntoNFiles(fileName, N):
        nbLine = FileUtils.getNbLinesInSingleFile(fileName)
        if N > nbLine:
            N = nbLine
        if N != 0:
            nbLinesInEachFile = int(math.ceil(float(nbLine) / N))
        else:
            # nothing to distribute: a single file receives the whole content
            N = 1
            nbLinesInEachFile = nbLine
        filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))
        with open(fileName, "r") as fileHandler:
            for i in range(1, N + 1):
                with open("%s-%s%s" % (filePrefix, i, fileExt), "w") as f:
                    for _ in range(nbLinesInEachFile):
                        f.write(fileHandler.readline())

    ## Split one file into files of N lines
    #
    # @param fileName string input file name
    # @param N int lines number per files
    #
    # @note output files are written in the current directory and named
    #       '<prefix>-<i><ext>' with i starting at 1
    # @note if the input file is empty, or if N is null or negative, the
    #       whole input file is copied into a single output file
    #
    @staticmethod
    def splitFileAccordingToLineNumber(fileName, N):
        filePrefix, fileExt = os.path.splitext(os.path.basename(fileName))
        with open(fileName) as inF:
            fileNb = 1
            line = inF.readline()
            if not line or N <= 0:
                outFileName = "%s-%s%s" % (filePrefix, fileNb, fileExt)
                with open(fileName, "rb") as wholeFile:
                    with open(outFileName, "wb") as f:
                        shutil.copyfileobj(wholeFile, f)
            else:
                while line:
                    outFileName = "%s-%s%s" % (filePrefix, fileNb, fileExt)
                    with open(outFileName, "w") as outF:
                        lineNb = 1
                        while lineNb <= N and line:
                            outF.write(line)
                            line = inF.readline()
                            lineNb += 1
                    fileNb += 1